howard.objects.variants

    1import csv
    2import gc
    3import gzip
    4import io
    5import multiprocessing
    6import os
    7import random
    8import re
    9import shlex
   10import sqlite3
   11import subprocess
   12from tempfile import NamedTemporaryFile, TemporaryDirectory
   13import tempfile
   14import duckdb
   15import json
   16import yaml
   17import argparse
   18import Bio.bgzf as bgzf
   19import pandas as pd
   20from pyfaidx import Fasta
   21import numpy as np
   22import vcf
   23import logging as log
   24import fastparquet as fp
   25from multiprocesspandas import applyparallel
   26import cyvcf2
   27import pyBigWig
   28
   29from howard.functions.commons import *
   30from howard.objects.database import *
   31from howard.functions.databases import *
   32from howard.functions.utils import *
   33
   34
   35class Variants:
   36
   37    def __init__(
   38        self,
   39        conn=None,
   40        input: str = None,
   41        output: str = None,
   42        config: dict = {},
   43        param: dict = {},
   44        load: bool = False,
   45    ) -> None:
   46        """
   47        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   48        header
   49
   50        :param conn: the connection to the database
   51        :param input: the input file
   52        :param output: the output file
   53        :param config: a dictionary containing the configuration of the model
   54        :param param: a dictionary containing the parameters of the model
   55        """
   56
   57        # Init variables
   58        self.init_variables()
   59
   60        # Input
   61        self.set_input(input)
   62
   63        # Config
   64        self.set_config(config)
   65
   66        # Param
   67        self.set_param(param)
   68
   69        # Output
   70        self.set_output(output)
   71
   72        # connexion
   73        self.set_connexion(conn)
   74
   75        # Header
   76        self.set_header()
   77
   78        # Samples
   79        self.set_samples()
   80
   81        # Load data
   82        if load:
   83            self.load_data()
   84
   85    def set_samples(self, samples: list = None) -> list:
   86        """
   87        The function `set_samples` sets the samples attribute of an object to a provided list or
   88        retrieves it from a parameter dictionary.
   89
   90        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   91        input and sets the `samples` attribute of the class to the provided list. If no samples are
   92        provided, it tries to get the samples from the class's parameters using the `get_param` method
   93        :type samples: list
   94        :return: The `samples` list is being returned.
   95        """
   96
   97        if not samples:
   98            samples = self.get_param().get("samples", {}).get("list", None)
   99
  100        self.samples = samples
  101
  102        return samples
  103
  104    def get_samples(self) -> list:
  105        """
  106        This function returns a list of samples.
  107        :return: The `get_samples` method is returning the `samples` attribute of the object.
  108        """
  109
  110        return self.samples
  111
  112    def get_samples_check(self) -> bool:
  113        """
  114        This function returns the value of the "check" key within the "samples" dictionary retrieved
  115        from the parameters.
  116        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  117        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  118        method. If the key "check" is not found, it will return `False`.
  119        """
  120
  121        return self.get_param().get("samples", {}).get("check", True)
  122
  123    def set_input(self, input: str = None) -> None:
  124        """
  125        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  126        attributes in the class accordingly.
  127
  128        :param input: The `set_input` method in the provided code snippet is used to set attributes
  129        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  130        :type input: str
  131        """
  132
  133        if input and not isinstance(input, str):
  134            try:
  135                self.input = input.name
  136            except:
  137                log.error(f"Input file '{input} in bad format")
  138                raise ValueError(f"Input file '{input} in bad format")
  139        else:
  140            self.input = input
  141
  142        # Input format
  143        if input:
  144            input_name, input_extension = os.path.splitext(self.input)
  145            self.input_name = input_name
  146            self.input_extension = input_extension
  147            self.input_format = self.input_extension.replace(".", "")
  148
  149    def set_config(self, config: dict) -> None:
  150        """
  151        The set_config function takes a config object and assigns it as the configuration object for the
  152        class.
  153
  154        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  155        contains configuration settings for the class. When you call the `set_config` function with a
  156        dictionary object as the argument, it will set that dictionary as the configuration object for
  157        the class
  158        :type config: dict
  159        """
  160
  161        self.config = config
  162
  163    def set_param(self, param: dict) -> None:
  164        """
  165        This function sets a parameter object for the class based on the input dictionary.
  166
  167        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  168        as the `param` attribute of the class instance
  169        :type param: dict
  170        """
  171
  172        self.param = param
  173
  174    def init_variables(self) -> None:
  175        """
  176        This function initializes the variables that will be used in the rest of the class
  177        """
  178
  179        self.prefix = "howard"
  180        self.table_variants = "variants"
  181        self.dataframe = None
  182
  183        self.comparison_map = {
  184            "gt": ">",
  185            "gte": ">=",
  186            "lt": "<",
  187            "lte": "<=",
  188            "equals": "=",
  189            "contains": "SIMILAR TO",
  190        }
  191
  192        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  193
  194        self.code_type_map_to_sql = {
  195            "Integer": "INTEGER",
  196            "String": "VARCHAR",
  197            "Float": "FLOAT",
  198            "Flag": "VARCHAR",
  199        }
  200
  201        self.index_additionnal_fields = []
  202
  203    def get_indexing(self) -> bool:
  204        """
  205        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  206        returns False.
  207        :return: The value of the indexing parameter.
  208        """
  209
  210        return self.get_param().get("indexing", False)
  211
  212    def get_connexion_config(self) -> dict:
  213        """
  214        The function `get_connexion_config` returns a dictionary containing the configuration for a
  215        connection, including the number of threads and memory limit.
  216        :return: a dictionary containing the configuration for the Connexion library.
  217        """
  218
  219        # config
  220        config = self.get_config()
  221
  222        # Connexion config
  223        connexion_config = {}
  224        threads = self.get_threads()
  225
  226        # Threads
  227        if threads:
  228            connexion_config["threads"] = threads
  229
  230        # Memory
  231        # if config.get("memory", None):
  232        #     connexion_config["memory_limit"] = config.get("memory")
  233        if self.get_memory():
  234            connexion_config["memory_limit"] = self.get_memory()
  235
  236        # Temporary directory
  237        if config.get("tmp", None):
  238            connexion_config["temp_directory"] = config.get("tmp")
  239
  240        # Access
  241        if config.get("access", None):
  242            access = config.get("access")
  243            if access in ["RO"]:
  244                access = "READ_ONLY"
  245            elif access in ["RW"]:
  246                access = "READ_WRITE"
  247            connexion_db = self.get_connexion_db()
  248            if connexion_db in ":memory:":
  249                access = "READ_WRITE"
  250            connexion_config["access_mode"] = access
  251
  252        return connexion_config
  253
  254    def get_duckdb_settings(self) -> dict:
  255        """
  256        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  257        string.
  258        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  259        """
  260
  261        # config
  262        config = self.get_config()
  263
  264        # duckdb settings
  265        duckdb_settings_dict = {}
  266        if config.get("duckdb_settings", None):
  267            duckdb_settings = config.get("duckdb_settings")
  268            duckdb_settings = full_path(duckdb_settings)
  269            # duckdb setting is a file
  270            if os.path.exists(duckdb_settings):
  271                with open(duckdb_settings) as json_file:
  272                    duckdb_settings_dict = yaml.safe_load(json_file)
  273            # duckdb settings is a string
  274            else:
  275                duckdb_settings_dict = json.loads(duckdb_settings)
  276
  277        return duckdb_settings_dict
  278
  279    def set_connexion_db(self) -> str:
  280        """
  281        The function `set_connexion_db` returns the appropriate database connection string based on the
  282        input format and connection type.
  283        :return: the value of the variable `connexion_db`.
  284        """
  285
  286        # Default connexion db
  287        default_connexion_db = ":memory:"
  288
  289        # Find connexion db
  290        if self.get_input_format() in ["db", "duckdb"]:
  291            connexion_db = self.get_input()
  292        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  293            connexion_db = default_connexion_db
  294        elif self.get_connexion_type() in ["tmpfile"]:
  295            tmp_name = tempfile.mkdtemp(
  296                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  297            )
  298            connexion_db = f"{tmp_name}/tmp.db"
  299        elif self.get_connexion_type() != "":
  300            connexion_db = self.get_connexion_type()
  301        else:
  302            connexion_db = default_connexion_db
  303
  304        # Set connexion db
  305        self.connexion_db = connexion_db
  306
  307        return connexion_db
  308
    def set_connexion(self, conn) -> None:
        """
        Create (or adopt) the connexion to the database.

        When `conn` is None, a new connexion is opened on the connexion db
        determined by `set_connexion_db`, using the configured connexion
        format ("duckdb" by default, or "sqlite"); DuckDB settings from the
        configuration are applied as PRAGMA statements.

        :param conn: an existing database connexion, or None to create one
        """

        # Connexion db (path or ":memory:")
        connexion_db = self.set_connexion_db()

        # Connexion config (threads, memory, access mode...)
        connexion_config = self.get_connexion_config()

        # Connexion format
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion: only create one when none was provided by the caller
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # duckDB settings applied as PRAGMAs
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        # string values must be quoted in the PRAGMA statement
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  354
  355    def set_output(self, output: str = None) -> None:
  356        """
  357        The `set_output` function in Python sets the output file based on the input or a specified key
  358        in the config file, extracting the output name, extension, and format.
  359
  360        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  361        the output file. If the config file has an 'output' key, the method sets the output to the value
  362        of that key. If no output is provided, it sets the output to `None`
  363        :type output: str
  364        """
  365
  366        if output and not isinstance(output, str):
  367            self.output = output.name
  368        else:
  369            self.output = output
  370
  371        # Output format
  372        if self.output:
  373            output_name, output_extension = os.path.splitext(self.output)
  374            self.output_name = output_name
  375            self.output_extension = output_extension
  376            self.output_format = self.output_extension.replace(".", "")
  377        else:
  378            self.output_name = None
  379            self.output_extension = None
  380            self.output_format = None
  381
  382    def set_header(self) -> None:
  383        """
  384        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
  385        """
  386
  387        input_file = self.get_input()
  388        default_header_list = [
  389            "##fileformat=VCFv4.2",
  390            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
  391        ]
  392
  393        # Full path
  394        input_file = full_path(input_file)
  395
  396        if input_file:
  397
  398            input_format = self.get_input_format()
  399            input_compressed = self.get_input_compressed()
  400            config = self.get_config()
  401            header_list = default_header_list
  402            if input_format in [
  403                "vcf",
  404                "hdr",
  405                "tsv",
  406                "csv",
  407                "psv",
  408                "parquet",
  409                "db",
  410                "duckdb",
  411            ]:
  412                # header provided in param
  413                if config.get("header_file", None):
  414                    with open(config.get("header_file"), "rt") as f:
  415                        header_list = self.read_vcf_header(f)
  416                # within a vcf file format (header within input file itsself)
  417                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
  418                    # within a compressed vcf file format (.vcf.gz)
  419                    if input_compressed:
  420                        with bgzf.open(input_file, "rt") as f:
  421                            header_list = self.read_vcf_header(f)
  422                    # within an uncompressed vcf file format (.vcf)
  423                    else:
  424                        with open(input_file, "rt") as f:
  425                            header_list = self.read_vcf_header(f)
  426                # header provided in default external file .hdr
  427                elif os.path.exists((input_file + ".hdr")):
  428                    with open(input_file + ".hdr", "rt") as f:
  429                        header_list = self.read_vcf_header(f)
  430                else:
  431                    try:  # Try to get header info fields and file columns
  432
  433                        with tempfile.TemporaryDirectory() as tmpdir:
  434
  435                            # Create database
  436                            db_for_header = Database(database=input_file)
  437
  438                            # Get header columns for infos fields
  439                            db_header_from_columns = (
  440                                db_for_header.get_header_from_columns()
  441                            )
  442
  443                            # Get real columns in the file
  444                            db_header_columns = db_for_header.get_columns()
  445
  446                            # Write header file
  447                            header_file_tmp = os.path.join(tmpdir, "header")
  448                            f = open(header_file_tmp, "w")
  449                            vcf.Writer(f, db_header_from_columns)
  450                            f.close()
  451
  452                            # Replace #CHROM line with rel columns
  453                            header_list = db_for_header.read_header_file(
  454                                header_file=header_file_tmp
  455                            )
  456                            header_list[-1] = "\t".join(db_header_columns)
  457
  458                    except:
  459
  460                        log.warning(
  461                            f"No header for file {input_file}. Set as default VCF header"
  462                        )
  463                        header_list = default_header_list
  464
  465            else:  # try for unknown format ?
  466
  467                log.error(f"Input file format '{input_format}' not available")
  468                raise ValueError(f"Input file format '{input_format}' not available")
  469
  470            if not header_list:
  471                header_list = default_header_list
  472
  473            # header as list
  474            self.header_list = header_list
  475
  476            # header as VCF object
  477            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
  478
  479        else:
  480
  481            self.header_list = None
  482            self.header_vcf = None
  483
  484    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  485        """
  486        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  487        DataFrame based on the connection format.
  488
  489        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  490        represents the SQL query you want to execute. This query will be used to fetch data from a
  491        database and convert it into a pandas DataFrame
  492        :type query: str
  493        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  494        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  495        function will only fetch up to that number of rows from the database query result. If no limit
  496        is specified,
  497        :type limit: int
  498        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  499        """
  500
  501        # Connexion format
  502        connexion_format = self.get_connexion_format()
  503
  504        # Limit in query
  505        if limit:
  506            pd.set_option("display.max_rows", limit)
  507            if connexion_format in ["duckdb"]:
  508                df = (
  509                    self.conn.execute(query)
  510                    .fetch_record_batch(limit)
  511                    .read_next_batch()
  512                    .to_pandas()
  513                )
  514            elif connexion_format in ["sqlite"]:
  515                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  516
  517        # Full query
  518        else:
  519            if connexion_format in ["duckdb"]:
  520                df = self.conn.execute(query).df()
  521            elif connexion_format in ["sqlite"]:
  522                df = pd.read_sql_query(query, self.conn)
  523
  524        return df
  525
  526    def get_overview(self) -> None:
  527        """
  528        The function prints the input, output, config, and dataframe of the current object
  529        """
  530        table_variants_from = self.get_table_variants(clause="from")
  531        sql_columns = self.get_header_columns_as_sql()
  532        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  533        df = self.get_query_to_df(sql_query_export)
  534        log.info(
  535            "Input:  "
  536            + str(self.get_input())
  537            + " ["
  538            + str(str(self.get_input_format()))
  539            + "]"
  540        )
  541        log.info(
  542            "Output: "
  543            + str(self.get_output())
  544            + " ["
  545            + str(str(self.get_output_format()))
  546            + "]"
  547        )
  548        log.info("Config: ")
  549        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  550            "\n"
  551        ):
  552            log.info("\t" + str(d))
  553        log.info("Param: ")
  554        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  555            "\n"
  556        ):
  557            log.info("\t" + str(d))
  558        log.info("Sample list: " + str(self.get_header_sample_list()))
  559        log.info("Dataframe: ")
  560        for d in str(df).split("\n"):
  561            log.info("\t" + str(d))
  562
  563        # garbage collector
  564        del df
  565        gc.collect()
  566
  567        return None
  568
    def get_stats(self) -> dict:
        """
        Compute statistics of the current object: input file, number of
        variants (total and by chromosome), genotype counts per sample,
        INFO/FORMAT header fields, quality statistics and SNV/InDel/MNV
        counts with SNV substitution counts.

        :return: a dictionary with keys "Infos", "Variants", "Samples" (only
        when genotypes are available), "Header" and "Quality" (only when a
        QUAL column is present)
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT fields
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage (fraction of total, per chromosome)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: genotype stats require a GT FORMAT field and a FORMAT column
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count variants per genotype for this sample; rows qualify
                # when the sample value starts with a genotype and the FORMAT
                # and sample fields have the same number of ':'-separated
                # entries
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample counts only when at least one genotype matched
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map special VCF Number codes (None/A/G/R) to symbols
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL (rows with '.' quality are excluded from the aggregates)
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitution counts (REF>ALT pairs), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  790
  791    def stats_to_file(self, file: str = None) -> str:
  792        """
  793        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  794        into a JSON object, and writes the JSON object to the specified file.
  795
  796        :param file: The `file` parameter is a string that represents the file path where the JSON data
  797        will be written
  798        :type file: str
  799        :return: the name of the file that was written to.
  800        """
  801
  802        # Get stats
  803        stats = self.get_stats()
  804
  805        # Serializing json
  806        json_object = json.dumps(stats, indent=4)
  807
  808        # Writing to sample.json
  809        with open(file, "w") as outfile:
  810            outfile.write(json_object)
  811
  812        return file
  813
  814    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  815        """
  816        The `print_stats` function generates a markdown file and prints the statistics contained in a
  817        JSON file in a formatted manner.
  818
  819        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  820        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  821        provided, a temporary directory will be created and the stats will be saved in a file named
  822        "stats.md" within that
  823        :type output_file: str
  824        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  825        file where the statistics will be saved. If no value is provided, a temporary directory will be
  826        created and a default file name "stats.json" will be used
  827        :type json_file: str
  828        :return: The function `print_stats` does not return any value. It has a return type annotation
  829        of `None`.
  830        """
  831
  832        # Full path
  833        output_file = full_path(output_file)
  834        json_file = full_path(json_file)
  835
  836        with tempfile.TemporaryDirectory() as tmpdir:
  837
  838            # Files
  839            if not output_file:
  840                output_file = os.path.join(tmpdir, "stats.md")
  841            if not json_file:
  842                json_file = os.path.join(tmpdir, "stats.json")
  843
  844            # Create folders
  845            if not os.path.exists(os.path.dirname(output_file)):
  846                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  847            if not os.path.exists(os.path.dirname(json_file)):
  848                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  849
  850            # Create stats JSON file
  851            stats_file = self.stats_to_file(file=json_file)
  852
  853            # Print stats file
  854            with open(stats_file) as f:
  855                stats = yaml.safe_load(f)
  856
  857            # Output
  858            output_title = []
  859            output_index = []
  860            output = []
  861
  862            # Title
  863            output_title.append("# HOWARD Stats")
  864
  865            # Index
  866            output_index.append("## Index")
  867
  868            # Process sections
  869            for section in stats:
  870                infos = stats.get(section)
  871                section_link = "#" + section.lower().replace(" ", "-")
  872                output.append(f"## {section}")
  873                output_index.append(f"- [{section}]({section_link})")
  874
  875                if len(infos):
  876                    for info in infos:
  877                        try:
  878                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  879                            is_df = True
  880                        except:
  881                            try:
  882                                df = pd.DataFrame.from_dict(
  883                                    json.loads((infos.get(info))), orient="index"
  884                                )
  885                                is_df = True
  886                            except:
  887                                is_df = False
  888                        if is_df:
  889                            output.append(f"### {info}")
  890                            info_link = "#" + info.lower().replace(" ", "-")
  891                            output_index.append(f"   - [{info}]({info_link})")
  892                            output.append(f"{df.to_markdown(index=False)}")
  893                        else:
  894                            output.append(f"- {info}: {infos.get(info)}")
  895                else:
  896                    output.append(f"NA")
  897
  898            # Write stats in markdown file
  899            with open(output_file, "w") as fp:
  900                for item in output_title:
  901                    fp.write("%s\n" % item)
  902                for item in output_index:
  903                    fp.write("%s\n" % item)
  904                for item in output:
  905                    fp.write("%s\n" % item)
  906
  907            # Output stats in markdown
  908            print("")
  909            print("\n\n".join(output_title))
  910            print("")
  911            print("\n\n".join(output))
  912            print("")
  913
  914        return None
  915
  916    def get_input(self) -> str:
  917        """
  918        It returns the value of the input variable.
  919        :return: The input is being returned.
  920        """
  921        return self.input
  922
  923    def get_input_format(self, input_file: str = None) -> str:
  924        """
  925        This function returns the format of the input variable, either from the provided input file or
  926        by prompting for input.
  927
  928        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  929        represents the file path of the input file. If no `input_file` is provided when calling the
  930        method, it will default to `None`
  931        :type input_file: str
  932        :return: The format of the input variable is being returned.
  933        """
  934
  935        if not input_file:
  936            input_file = self.get_input()
  937        input_format = get_file_format(input_file)
  938        return input_format
  939
  940    def get_input_compressed(self, input_file: str = None) -> str:
  941        """
  942        The function `get_input_compressed` returns the format of the input variable after compressing
  943        it.
  944
  945        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  946        that represents the file path of the input file. If no `input_file` is provided when calling the
  947        method, it will default to `None` and the method will then call `self.get_input()` to
  948        :type input_file: str
  949        :return: The function `get_input_compressed` returns the compressed format of the input
  950        variable.
  951        """
  952
  953        if not input_file:
  954            input_file = self.get_input()
  955        input_compressed = get_file_compressed(input_file)
  956        return input_compressed
  957
  958    def get_output(self) -> str:
  959        """
  960        It returns the output of the neuron.
  961        :return: The output of the neural network.
  962        """
  963
  964        return self.output
  965
  966    def get_output_format(self, output_file: str = None) -> str:
  967        """
  968        The function `get_output_format` returns the format of the input variable or the output file if
  969        provided.
  970
  971        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  972        that represents the file path of the output file. If no `output_file` is provided when calling
  973        the method, it will default to the output obtained from the `get_output` method of the class
  974        instance. The
  975        :type output_file: str
  976        :return: The format of the input variable is being returned.
  977        """
  978
  979        if not output_file:
  980            output_file = self.get_output()
  981        output_format = get_file_format(output_file)
  982
  983        return output_format
  984
  985    def get_config(self) -> dict:
  986        """
  987        It returns the config
  988        :return: The config variable is being returned.
  989        """
  990        return self.config
  991
  992    def get_param(self) -> dict:
  993        """
  994        It returns the param
  995        :return: The param variable is being returned.
  996        """
  997        return self.param
  998
  999    def get_connexion_db(self) -> str:
 1000        """
 1001        It returns the connexion_db attribute of the object
 1002        :return: The connexion_db is being returned.
 1003        """
 1004        return self.connexion_db
 1005
 1006    def get_prefix(self) -> str:
 1007        """
 1008        It returns the prefix of the object.
 1009        :return: The prefix is being returned.
 1010        """
 1011        return self.prefix
 1012
 1013    def get_table_variants(self, clause: str = "select") -> str:
 1014        """
 1015        This function returns the table_variants attribute of the object
 1016
 1017        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1018        defaults to select (optional)
 1019        :return: The table_variants attribute of the object.
 1020        """
 1021
 1022        # Access
 1023        access = self.get_config().get("access", None)
 1024
 1025        # Clauses "select", "where", "update"
 1026        if clause in ["select", "where", "update"]:
 1027            table_variants = self.table_variants
 1028        # Clause "from"
 1029        elif clause in ["from"]:
 1030            # For Read Only
 1031            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1032                input_file = self.get_input()
 1033                table_variants = f"'{input_file}' as variants"
 1034            # For Read Write
 1035            else:
 1036                table_variants = f"{self.table_variants} as variants"
 1037        else:
 1038            table_variants = self.table_variants
 1039        return table_variants
 1040
 1041    def get_tmp_dir(self) -> str:
 1042        """
 1043        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1044        parameters or a default path.
 1045        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1046        configuration, parameters, and a default value of "/tmp".
 1047        """
 1048
 1049        return get_tmp(
 1050            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1051        )
 1052
 1053    def get_connexion_type(self) -> str:
 1054        """
 1055        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1056
 1057        :return: The connexion type is being returned.
 1058        """
 1059        return self.get_config().get("connexion_type", "memory")
 1060
 1061    def get_connexion(self):
 1062        """
 1063        It returns the connection object
 1064
 1065        :return: The connection object.
 1066        """
 1067        return self.conn
 1068
 1069    def close_connexion(self) -> None:
 1070        """
 1071        This function closes the connection to the database.
 1072        :return: The connection is being closed.
 1073        """
 1074        return self.conn.close()
 1075
 1076    def get_header(self, type: str = "vcf"):
 1077        """
 1078        This function returns the header of the VCF file as a list of strings
 1079
 1080        :param type: the type of header you want to get, defaults to vcf (optional)
 1081        :return: The header of the vcf file.
 1082        """
 1083
 1084        if self.header_vcf:
 1085            if type == "vcf":
 1086                return self.header_vcf
 1087            elif type == "list":
 1088                return self.header_list
 1089        else:
 1090            if type == "vcf":
 1091                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1092                return header
 1093            elif type == "list":
 1094                return vcf_required
 1095
 1096    def get_header_infos_list(self) -> list:
 1097        """
 1098        This function retrieves a list of information fields from the header.
 1099        :return: A list of information fields from the header.
 1100        """
 1101
 1102        # Init
 1103        infos_list = []
 1104
 1105        for field in self.get_header().infos:
 1106            infos_list.append(field)
 1107
 1108        return infos_list
 1109
 1110    def get_header_length(self, file: str = None) -> int:
 1111        """
 1112        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1113        line.
 1114
 1115        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1116        header file. If this argument is provided, the function will read the header from the specified
 1117        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1118        :type file: str
 1119        :return: the length of the header list, excluding the #CHROM line.
 1120        """
 1121
 1122        if file:
 1123            return len(self.read_vcf_header_file(file=file)) - 1
 1124        elif self.get_header(type="list"):
 1125            return len(self.get_header(type="list")) - 1
 1126        else:
 1127            return 0
 1128
 1129    def get_header_columns(self) -> str:
 1130        """
 1131        This function returns the header list of a VCF
 1132
 1133        :return: The length of the header list.
 1134        """
 1135        if self.get_header():
 1136            return self.get_header(type="list")[-1]
 1137        else:
 1138            return ""
 1139
 1140    def get_header_columns_as_list(self) -> list:
 1141        """
 1142        This function returns the header list of a VCF
 1143
 1144        :return: The length of the header list.
 1145        """
 1146        if self.get_header():
 1147            return self.get_header_columns().strip().split("\t")
 1148        else:
 1149            return []
 1150
 1151    def get_header_columns_as_sql(self) -> str:
 1152        """
 1153        This function retruns header length (without #CHROM line)
 1154
 1155        :return: The length of the header list.
 1156        """
 1157        sql_column_list = []
 1158        for col in self.get_header_columns_as_list():
 1159            sql_column_list.append(f'"{col}"')
 1160        return ",".join(sql_column_list)
 1161
 1162    def get_header_sample_list(
 1163        self, check: bool = False, samples: list = None, samples_force: bool = False
 1164    ) -> list:
 1165        """
 1166        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1167        checking and filtering based on input parameters.
 1168
 1169        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1170        parameter that determines whether to check if the samples in the list are properly defined as
 1171        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1172        list is defined as a, defaults to False
 1173        :type check: bool (optional)
 1174        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1175        allows you to specify a subset of samples from the header. If you provide a list of sample
 1176        names, the function will check if each sample is defined in the header. If a sample is not found
 1177        in the
 1178        :type samples: list
 1179        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1180        a boolean parameter that determines whether to force the function to return the sample list
 1181        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1182        function will return the sample list without performing, defaults to False
 1183        :type samples_force: bool (optional)
 1184        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1185        parameters and conditions specified in the function.
 1186        """
 1187
 1188        # Init
 1189        samples_list = []
 1190
 1191        if samples is None:
 1192            samples_list = self.header_vcf.samples
 1193        else:
 1194            samples_checked = []
 1195            for sample in samples:
 1196                if sample in self.header_vcf.samples:
 1197                    samples_checked.append(sample)
 1198                else:
 1199                    log.warning(f"Sample '{sample}' not defined in header")
 1200            samples_list = samples_checked
 1201
 1202            # Force sample list without checking if is_genotype_column
 1203            if samples_force:
 1204                log.warning(f"Samples {samples_list} not checked if genotypes")
 1205                return samples_list
 1206
 1207        if check:
 1208            samples_checked = []
 1209            for sample in samples_list:
 1210                if self.is_genotype_column(column=sample):
 1211                    samples_checked.append(sample)
 1212                else:
 1213                    log.warning(
 1214                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1215                    )
 1216            samples_list = samples_checked
 1217
 1218        # Return samples list
 1219        return samples_list
 1220
 1221    def is_genotype_column(self, column: str = None) -> bool:
 1222        """
 1223        This function checks if a given column is a genotype column in a database.
 1224
 1225        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1226        represents the column name in a database table. This method checks if the specified column is a
 1227        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1228        method of
 1229        :type column: str
 1230        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1231        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1232        column name and returns the result. If the `column` parameter is None, it returns False.
 1233        """
 1234
 1235        if column is not None:
 1236            return Database(database=self.get_input()).is_genotype_column(column=column)
 1237        else:
 1238            return False
 1239
 1240    def get_verbose(self) -> bool:
 1241        """
 1242        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1243        exist
 1244
 1245        :return: The value of the key "verbose" in the config dictionary.
 1246        """
 1247        return self.get_config().get("verbose", False)
 1248
 1249    def get_connexion_format(self) -> str:
 1250        """
 1251        It returns the connexion format of the object.
 1252        :return: The connexion_format is being returned.
 1253        """
 1254        connexion_format = self.connexion_format
 1255        if connexion_format not in ["duckdb", "sqlite"]:
 1256            log.error(f"Unknown connexion format {connexion_format}")
 1257            raise ValueError(f"Unknown connexion format {connexion_format}")
 1258        else:
 1259            return connexion_format
 1260
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table, using the strategy matching the current connexion
        format (DuckDB or SQLite).

        :param file: Path or file-like object of the file to load
        :param columns: Comma-separated, quoted column names used in the
        INSERT statement (DuckDB branch only)
        :type columns: str
        :param header_len: Number of leading lines to skip (e.g. the VCF
        header), defaults to 0
        :type header_len: int (optional)
        :param sep: Field delimiter of the input file, a tab by default
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk, defaults to 1000000;
        overridden by the "load.chunk" config entry when present
        :type chunksize: int (optional)
        """

        # The "load.chunk" config entry takes precedence over the parameter
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): if chunksize resolves to 0 or None, the loop below is
        # skipped entirely and nothing is inserted (silent no-op) — confirm
        # this is the intended behavior
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves "FROM chunk" through its pandas
                    # replacement scan: the table name in the SQL must match
                    # the local DataFrame variable name "chunk" — do not rename
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite: append the DataFrame chunk directly via pandas
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1314
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Read the input file (existing DuckDB database, parquet-like source, or
        a delimited VCF/TSV/CSV/PSV file) and load it into the variants table
        of the current connexion.

        :param input_file: Path of the input file; if given, it replaces the
        current input and the header is re-read
        :type input_file: str
        :param drop_variants_table: If True, drop the variants table before
        loading the data, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: Number of rows sampled from the input (used for
        type inference); falsy values are normalized to -1, defaults to 20480
        :type sample_size: int (optional)
        :raises ValueError: If the input format is not compatible with the
        connexion format, or the input format is not available
        """

        log.info("Loading...")

        # Replace the current input file and re-read its header
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # Drop variants table before (re)loading, if requested
        if drop_variants_table:
            self.drop_variants_table()

        # Target table name
        table_variants = self.get_table_variants()

        # Access mode ("RO" selects view creation over table creation below)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compression status
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # Normalize compression status to a format label
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format ("duckdb" or "sqlite")
        connexion_format = self.get_connexion_format()

        # Falsy sample size means "all rows" (-1)
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Input is already a DuckDB database: nothing to load
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    # NOTE(review): unreachable under the enclosing
                    # connexion_format check — kept as a defensive guard
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from any other format through the Database abstraction
            else:

                try:
                    # Build the SQL source expression for the input file
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read-only access creates a view; otherwise materialize a table
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except:
                    # Any failure is reported as an unavailable input format
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion: only delimited text formats are supported
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Mandatory VCF columns and their SQL types
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Extend the structure with FORMAT and one column per sample
            # NOTE(review): this is an alias, not a copy — mutations below
            # also modify `structure`; harmless here but worth confirming
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Column fragments for CREATE TABLE and for the INSERT column list
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create the variants table if it does not exist yet
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # Number of rows per chunk when loading the file
            chunksize = 100000

            # Field delimiter for the input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # NOTE(review): for compressed input the handle is replaced by
                # a bgzf handle; the with-statement still closes the original
                # handle, but the bgzf one is never explicitly closed — confirm
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                # Only VCF files carry a header to skip
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into the variants table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            # Unsupported combination of connexion format and input format
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFO fields into table columns when configured to do so
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create indexes after insertion
        self.create_indexes()
 1510
 1511    def get_explode_infos(self) -> bool:
 1512        """
 1513        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1514        to False if it is not set.
 1515        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1516        value. If the parameter is not present, it will return False.
 1517        """
 1518
 1519        return self.get_param().get("explode", {}).get("explode_infos", False)
 1520
 1521    def get_explode_infos_fields(
 1522        self,
 1523        explode_infos_fields: str = None,
 1524        remove_fields_not_in_header: bool = False,
 1525    ) -> list:
 1526        """
 1527        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1528        the input parameter `explode_infos_fields`.
 1529
 1530        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1531        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1532        comma-separated list of field names to explode
 1533        :type explode_infos_fields: str
 1534        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1535        flag that determines whether to remove fields that are not present in the header. If it is set
 1536        to `True`, any field that is not in the header will be excluded from the list of exploded
 1537        information fields. If it is set to `, defaults to False
 1538        :type remove_fields_not_in_header: bool (optional)
 1539        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1540        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1541        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1542        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1543        splitting the string by commas.
 1544        """
 1545
 1546        # If no fields, get it in param
 1547        if not explode_infos_fields:
 1548            explode_infos_fields = (
 1549                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1550            )
 1551
 1552        # If no fields, defined as all fields in header using keyword
 1553        if not explode_infos_fields:
 1554            explode_infos_fields = "*"
 1555
 1556        # If fields list not empty
 1557        if explode_infos_fields:
 1558
 1559            # Input fields list
 1560            if isinstance(explode_infos_fields, str):
 1561                fields_input = explode_infos_fields.split(",")
 1562            elif isinstance(explode_infos_fields, list):
 1563                fields_input = explode_infos_fields
 1564            else:
 1565                fields_input = []
 1566
 1567            # Fields list without * keyword
 1568            fields_without_all = fields_input.copy()
 1569            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1570                fields_without_all.remove("*")
 1571
 1572            # Fields in header
 1573            fields_in_header = sorted(list(set(self.get_header().infos)))
 1574
 1575            # Construct list of fields
 1576            fields_output = []
 1577            for field in fields_input:
 1578
 1579                # Strip field
 1580                field = field.strip()
 1581
 1582                # format keyword * in regex
 1583                if field.upper() in ["*"]:
 1584                    field = ".*"
 1585
 1586                # Find all fields with pattern
 1587                r = re.compile(field)
 1588                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1589
 1590                # Remove fields input from search
 1591                if field in fields_search:
 1592                    fields_search = [field]
 1593                elif fields_search != [field]:
 1594                    fields_search = sorted(
 1595                        list(set(fields_search).difference(fields_input))
 1596                    )
 1597
 1598                # If field is not in header (avoid not well formatted header)
 1599                if not fields_search and not remove_fields_not_in_header:
 1600                    fields_search = [field]
 1601
 1602                # Add found fields
 1603                for new_field in fields_search:
 1604                    # Add field, if not already exists, and if it is in header (if asked)
 1605                    if (
 1606                        new_field not in fields_output
 1607                        and (
 1608                            not remove_fields_not_in_header
 1609                            or new_field in fields_in_header
 1610                        )
 1611                        and new_field not in [".*"]
 1612                    ):
 1613                        fields_output.append(new_field)
 1614
 1615            return fields_output
 1616
 1617        else:
 1618
 1619            return []
 1620
 1621    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1622        """
 1623        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1624        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1625        not provided.
 1626
 1627        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1628        prefix to be used for exploding or expanding information
 1629        :type explode_infos_prefix: str
 1630        :return: the value of the variable `explode_infos_prefix`.
 1631        """
 1632
 1633        if not explode_infos_prefix:
 1634            explode_infos_prefix = (
 1635                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1636            )
 1637
 1638        return explode_infos_prefix
 1639
 1640    def add_column(
 1641        self,
 1642        table_name,
 1643        column_name,
 1644        column_type,
 1645        default_value=None,
 1646        drop: bool = False,
 1647    ) -> dict:
 1648        """
 1649        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1650        doesn't already exist.
 1651
 1652        :param table_name: The name of the table to which you want to add a column
 1653        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1654        to the table
 1655        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1656        want to add to the table. It should be a string that represents the desired data type, such as
 1657        "INTEGER", "TEXT", "REAL", etc
 1658        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1659        default value for the newly added column. If a default value is provided, it will be assigned to
 1660        the column for any existing rows that do not have a value for that column
 1661        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1662        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1663        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1664        to False
 1665        :type drop: bool (optional)
 1666        :return: a boolean value indicating whether the column was successfully added to the table.
 1667        """
 1668
 1669        # added
 1670        added = False
 1671        dropped = False
 1672
 1673        # Check if the column already exists in the table
 1674        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1675        columns = self.get_query_to_df(query).columns.tolist()
 1676        if column_name.upper() in [c.upper() for c in columns]:
 1677            log.debug(
 1678                f"The {column_name} column already exists in the {table_name} table"
 1679            )
 1680            if drop:
 1681                self.drop_column(table_name=table_name, column_name=column_name)
 1682                dropped = True
 1683            else:
 1684                return None
 1685        else:
 1686            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1687
 1688        # Add column in table
 1689        add_column_query = (
 1690            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1691        )
 1692        if default_value is not None:
 1693            add_column_query += f" DEFAULT {default_value}"
 1694        self.execute_query(add_column_query)
 1695        added = not dropped
 1696        log.debug(
 1697            f"The {column_name} column was successfully added to the {table_name} table"
 1698        )
 1699
 1700        if added:
 1701            added_column = {
 1702                "table_name": table_name,
 1703                "column_name": column_name,
 1704                "column_type": column_type,
 1705                "default_value": default_value,
 1706            }
 1707        else:
 1708            added_column = None
 1709
 1710        return added_column
 1711
 1712    def drop_column(
 1713        self, column: dict = None, table_name: str = None, column_name: str = None
 1714    ) -> bool:
 1715        """
 1716        The `drop_column` function drops a specified column from a given table in a database and returns
 1717        True if the column was successfully dropped, and False if the column does not exist in the
 1718        table.
 1719
 1720        :param column: The `column` parameter is a dictionary that contains information about the column
 1721        you want to drop. It has two keys:
 1722        :type column: dict
 1723        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1724        drop a column
 1725        :type table_name: str
 1726        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1727        from the table
 1728        :type column_name: str
 1729        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1730        and False if the column does not exist in the table.
 1731        """
 1732
 1733        # Find column infos
 1734        if column:
 1735            if isinstance(column, dict):
 1736                table_name = column.get("table_name", None)
 1737                column_name = column.get("column_name", None)
 1738            elif isinstance(column, str):
 1739                table_name = self.get_table_variants()
 1740                column_name = column
 1741            else:
 1742                table_name = None
 1743                column_name = None
 1744
 1745        if not table_name and not column_name:
 1746            return False
 1747
 1748        # Removed
 1749        removed = False
 1750
 1751        # Check if the column already exists in the table
 1752        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1753        columns = self.get_query_to_df(query).columns.tolist()
 1754        if column_name in columns:
 1755            log.debug(f"The {column_name} column exists in the {table_name} table")
 1756        else:
 1757            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1758            return False
 1759
 1760        # Add column in table # ALTER TABLE integers DROP k
 1761        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1762        self.execute_query(add_column_query)
 1763        removed = True
 1764        log.debug(
 1765            f"The {column_name} column was successfully dropped to the {table_name} table"
 1766        )
 1767
 1768        return removed
 1769
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual table columns.

        For each selected INFO field, a column (named with the given
        prefix) is added to the variants table and populated by parsing
        the raw INFO string with engine-specific SQL (DuckDB regex
        extraction or SQLite string functions). Updates run chromosome by
        chromosome. No-op on read-only ("RO") connexions.

        :param prefix: prefix for the exploded columns; falls back to the
            configured prefix, then to "INFO/"
        :type prefix: str
        :param create_index: whether to (re)create indexes after exploding,
            defaults to False
        :type create_index: bool (optional)
        :param fields: list of INFO fields (or patterns) to explode; when
            empty, all fields selected by `get_explode_infos_fields`
        :type fields: list
        :param force: drop and re-create columns that already exist (their
            UPDATE is also re-run), defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: update all exploded columns in
            a single UPDATE statement instead of one statement per field,
            defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: table to add the columns to; the variants table when
            not provided
        :type table: str
        :return: the list of added columns (dicts as returned by
            `add_column`; re-created columns are not included)
        """

        # drop indexes (they would be invalidated by the ALTER/UPDATE below)
        self.drop_indexes()

        # connexion format (selects the SQL dialect used for extraction)
        connexion_format = self.get_connexion_format()

        # Access (skip everything on read-only connexions)
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: fall back to empty list when unavailable)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            # SET clauses accumulated for the UPDATE statement(s) below
            sql_info_alter_table_array = []

            # Info fields to check (header fields plus explicitly requested ones)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Exploded column name (prefix + INFO field name)
                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Column type from the header declaration; unknown fields
                    # default to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields (Number != 1) are kept as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # Populate the column when it was just added, or when forced
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the engine-specific SET clause extracting
                        # "<info>=<value>" from the raw INFO string ('' and '.'
                        # are normalized to NULL)
                        # NOTE(review): `update_info_field` is only assigned for
                        # "duckdb"/"sqlite"; any other connexion format would
                        # append an undefined or stale value — confirm formats
                        # are restricted upstream
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (best-effort: fall back to a single unfiltered
                # pass when the chromosome list cannot be fetched)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed when updating per chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # Single UPDATE with all SET clauses joined
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per exploded field
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
 1986
 1987    def create_indexes(self) -> None:
 1988        """
 1989        Create indexes on the table after insertion
 1990        """
 1991
 1992        # Access
 1993        access = self.get_config().get("access", None)
 1994
 1995        # get table variants
 1996        table_variants = self.get_table_variants("FROM")
 1997
 1998        if self.get_indexing() and access not in ["RO"]:
 1999            # Create index
 2000            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2001            self.conn.execute(sql_create_table_index)
 2002            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2003            self.conn.execute(sql_create_table_index)
 2004            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2005            self.conn.execute(sql_create_table_index)
 2006            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2007            self.conn.execute(sql_create_table_index)
 2008            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2009            self.conn.execute(sql_create_table_index)
 2010            for field in self.index_additionnal_fields:
 2011                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2012                self.conn.execute(sql_create_table_index)
 2013
 2014    def drop_indexes(self) -> None:
 2015        """
 2016        Create indexes on the table after insertion
 2017        """
 2018
 2019        # Access
 2020        access = self.get_config().get("access", None)
 2021
 2022        # get table variants
 2023        table_variants = self.get_table_variants("FROM")
 2024
 2025        # Get database format
 2026        connexion_format = self.get_connexion_format()
 2027
 2028        if access not in ["RO"]:
 2029            if connexion_format in ["duckdb"]:
 2030                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2031            elif connexion_format in ["sqlite"]:
 2032                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2033
 2034            list_indexes = self.conn.execute(sql_list_indexes)
 2035            index_names = [row[0] for row in list_indexes.fetchall()]
 2036            for index in index_names:
 2037                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2038                self.conn.execute(sql_drop_table_index)
 2039
 2040    def read_vcf_header(self, f) -> list:
 2041        """
 2042        It reads the header of a VCF file and returns a list of the header lines
 2043
 2044        :param f: the file object
 2045        :return: The header lines of the VCF file.
 2046        """
 2047
 2048        header_list = []
 2049        for line in f:
 2050            header_list.append(line)
 2051            if line.startswith("#CHROM"):
 2052                break
 2053        return header_list
 2054
 2055    def read_vcf_header_file(self, file: str = None) -> list:
 2056        """
 2057        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2058        uncompressed files.
 2059
 2060        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2061        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2062        default to `None`
 2063        :type file: str
 2064        :return: The function `read_vcf_header_file` returns a list.
 2065        """
 2066
 2067        if self.get_input_compressed(input_file=file):
 2068            with bgzf.open(file, "rt") as f:
 2069                return self.read_vcf_header(f=f)
 2070        else:
 2071            with open(file, "rt") as f:
 2072                return self.read_vcf_header(f=f)
 2073
 2074    def execute_query(self, query: str):
 2075        """
 2076        It takes a query as an argument, executes it, and returns the results
 2077
 2078        :param query: The query to be executed
 2079        :return: The result of the query is being returned.
 2080        """
 2081        if query:
 2082            return self.conn.execute(query)  # .fetchall()
 2083        else:
 2084            return None
 2085
 2086    def export_output(
 2087        self,
 2088        output_file: str | None = None,
 2089        output_header: str | None = None,
 2090        export_header: bool = True,
 2091        query: str | None = None,
 2092        parquet_partitions: list | None = None,
 2093        chunk_size: int | None = None,
 2094        threads: int | None = None,
 2095        sort: bool = False,
 2096        index: bool = False,
 2097        order_by: str | None = None,
 2098    ) -> bool:
 2099        """
 2100        The `export_output` function exports data from a VCF file to a specified output file in various
 2101        formats, including VCF, CSV, TSV, PSV, and Parquet.
 2102
 2103        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2104        output file to be generated by the function. This is where the exported data will be saved
 2105        :type output_file: str
 2106        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2107        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2108        header will be exported to a file with the same name as the `output_file` parameter, but with
 2109        the extension "
 2110        :type output_header: str
 2111        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2112        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2113        True, the header will be exported to a file. If `export_header` is False, the header will not
 2114        be, defaults to True, if output format is not VCF
 2115        :type export_header: bool (optional)
 2116        :param query: The `query` parameter is an optional SQL query that can be used to filter and
 2117        select specific data from the VCF file before exporting it. If provided, only the data that
 2118        matches the query will be exported
 2119        :type query: str
 2120        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2121        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2122        organize data in a hierarchical directory structure based on the values of one or more columns.
 2123        This can improve query performance when working with large datasets
 2124        :type parquet_partitions: list
 2125        :param chunk_size: The `chunk_size` parameter specifies the number of
 2126        records in batch when exporting data in Parquet format. This parameter is used for
 2127        partitioning the Parquet file into multiple files.
 2128        :type chunk_size: int
 2129        :param threads: The `threads` parameter is an optional parameter that specifies the number of
 2130        threads to be used during the export process. It determines the level of parallelism and can
 2131        improve the performance of the export operation. If not provided, the function will use the
 2132        default number of threads
 2133        :type threads: int
 2134        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
 2135        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
 2136        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
 2137        False
 2138        :type sort: bool (optional)
 2139        :param index: The `index` parameter is a boolean flag that determines whether an index should be
 2140        created on the output file. If `index` is True, an index will be created. If `index` is False,
 2141        no index will be created. The default value is False, defaults to False
 2142        :type index: bool (optional)
 2143        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
 2144        sorting the output file. This parameter is only applicable when exporting data in VCF format
 2145        :type order_by: str
 2146        :return: a boolean value. It checks if the output file exists and returns True if it does, or
 2147        None if it doesn't.
 2148        """
 2149
 2150        # Log
 2151        log.info("Exporting...")
 2152
 2153        # Full path
 2154        output_file = full_path(output_file)
 2155        output_header = full_path(output_header)
 2156
 2157        # Config
 2158        config = self.get_config()
 2159
 2160        # Param
 2161        param = self.get_param()
 2162
 2163        # Tmp files to remove
 2164        tmp_to_remove = []
 2165
 2166        # If no output, get it
 2167        if not output_file:
 2168            output_file = self.get_output()
 2169
 2170        # If not threads
 2171        if not threads:
 2172            threads = self.get_threads()
 2173
 2174        # Auto header name with extension
 2175        if export_header or output_header:
 2176            if not output_header:
 2177                output_header = f"{output_file}.hdr"
 2178            # Export header
 2179            self.export_header(output_file=output_file)
 2180
 2181        # Switch off export header if VCF output
 2182        output_file_type = get_file_format(output_file)
 2183        if output_file_type in ["vcf"]:
 2184            export_header = False
 2185            tmp_to_remove.append(output_header)
 2186
 2187        # Chunk size
 2188        if not chunk_size:
 2189            chunk_size = config.get("chunk_size", None)
 2190
 2191        # Parquet partition
 2192        if not parquet_partitions:
 2193            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2194        if parquet_partitions and isinstance(parquet_partitions, str):
 2195            parquet_partitions = parquet_partitions.split(",")
 2196
 2197        # Order by
 2198        if not order_by:
 2199            order_by = param.get("export", {}).get("order_by", "")
 2200
 2201        # Header in output
 2202        header_in_output = param.get("export", {}).get("include_header", False)
 2203
 2204        # Database
 2205        database_source = self.get_connexion()
 2206
 2207        # Connexion format
 2208        connexion_format = self.get_connexion_format()
 2209
 2210        # Explode infos
 2211        if self.get_explode_infos():
 2212            self.explode_infos(
 2213                prefix=self.get_explode_infos_prefix(),
 2214                fields=self.get_explode_infos_fields(),
 2215                force=False,
 2216            )
 2217
 2218        # if connexion_format in ["sqlite"] or query:
 2219        if connexion_format in ["sqlite"]:
 2220
 2221            # Export in Parquet
 2222            random_tmp = "".join(
 2223                random.choice(string.ascii_lowercase) for i in range(10)
 2224            )
 2225            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2226            tmp_to_remove.append(database_source)
 2227
 2228            # Table Variants
 2229            table_variants = self.get_table_variants()
 2230
 2231            # Create export query
 2232            sql_query_export_subquery = f"""
 2233                SELECT * FROM {table_variants}
 2234                """
 2235
 2236            # Write source file
 2237            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2238
 2239        # Create database
 2240        database = Database(
 2241            database=database_source,
 2242            table="variants",
 2243            header_file=output_header,
 2244            conn_config=self.get_connexion_config(),
 2245        )
 2246
 2247        # Existing colomns header
 2248        existing_columns_header = database.get_header_columns_from_database(query=query)
 2249
 2250        # Sample list
 2251        if output_file_type in ["vcf"]:
 2252            get_samples = self.get_samples()
 2253            get_samples_check = self.get_samples_check()
 2254            samples_force = get_samples is not None
 2255            sample_list = self.get_header_sample_list(
 2256                check=get_samples_check,
 2257                samples=get_samples,
 2258                samples_force=samples_force,
 2259            )
 2260        else:
 2261            sample_list = None
 2262
 2263        # Export file
 2264        database.export(
 2265            output_database=output_file,
 2266            output_header=output_header,
 2267            existing_columns_header=existing_columns_header,
 2268            parquet_partitions=parquet_partitions,
 2269            chunk_size=chunk_size,
 2270            threads=threads,
 2271            sort=sort,
 2272            index=index,
 2273            header_in_output=header_in_output,
 2274            order_by=order_by,
 2275            query=query,
 2276            export_header=export_header,
 2277            sample_list=sample_list,
 2278        )
 2279
 2280        # Remove
 2281        remove_if_exists(tmp_to_remove)
 2282
 2283        return (os.path.exists(output_file) or None) and (
 2284            os.path.exists(output_file) or None
 2285        )
 2286
 2287    def get_extra_infos(self, table: str = None) -> list:
 2288        """
 2289        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2290        in the header.
 2291
 2292        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2293        name of the table from which you want to retrieve the extra columns that are not present in the
 2294        header. If the `table` parameter is not provided when calling the function, it will default to
 2295        using the variants
 2296        :type table: str
 2297        :return: A list of columns that are in the specified table but not in the header of the table.
 2298        """
 2299
 2300        header_columns = []
 2301
 2302        if not table:
 2303            table = self.get_table_variants(clause="from")
 2304            header_columns = self.get_header_columns()
 2305
 2306        # Check all columns in the database
 2307        query = f""" SELECT * FROM {table} LIMIT 1 """
 2308        log.debug(f"query {query}")
 2309        table_columns = self.get_query_to_df(query).columns.tolist()
 2310        extra_columns = []
 2311
 2312        # Construct extra infos (not in header)
 2313        for column in table_columns:
 2314            if column not in header_columns:
 2315                extra_columns.append(column)
 2316
 2317        return extra_columns
 2318
 2319    def get_extra_infos_sql(self, table: str = None) -> str:
 2320        """
 2321        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2322        by double quotes
 2323
 2324        :param table: The name of the table to get the extra infos from. If None, the default table is
 2325        used
 2326        :type table: str
 2327        :return: A string of the extra infos
 2328        """
 2329
 2330        return ", ".join(
 2331            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2332        )
 2333
 2334    def export_header(
 2335        self,
 2336        header_name: str = None,
 2337        output_file: str = None,
 2338        output_file_ext: str = ".hdr",
 2339        clean_header: bool = True,
 2340        remove_chrom_line: bool = False,
 2341    ) -> str:
 2342        """
 2343        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2344        specified options, and writes it to a new file.
 2345
 2346        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2347        this parameter is not specified, the header will be written to the output file
 2348        :type header_name: str
 2349        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2350        specify the name of the output file where the header will be written. If this parameter is not
 2351        provided, the header will be written to a temporary file
 2352        :type output_file: str
 2353        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2354        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2355        if not specified by the user. This extension will be appended to the `output_file` name to
 2356        create the final, defaults to .hdr
 2357        :type output_file_ext: str (optional)
 2358        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2359        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2360        `True`, the function will clean the header by modifying certain lines based on a specific
 2361        pattern. If `clean_header`, defaults to True
 2362        :type clean_header: bool (optional)
 2363        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2364        boolean flag that determines whether the #CHROM line should be removed from the header before
 2365        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2366        defaults to False
 2367        :type remove_chrom_line: bool (optional)
 2368        :return: The function `export_header` returns the name of the temporary header file that is
 2369        created.
 2370        """
 2371
 2372        if not header_name and not output_file:
 2373            output_file = self.get_output()
 2374
 2375        if self.get_header():
 2376
 2377            # Get header object
 2378            header_obj = self.get_header()
 2379
 2380            # Create database
 2381            db_for_header = Database(database=self.get_input())
 2382
 2383            # Get real columns in the file
 2384            db_header_columns = db_for_header.get_columns()
 2385
 2386            with tempfile.TemporaryDirectory() as tmpdir:
 2387
 2388                # Write header file
 2389                header_file_tmp = os.path.join(tmpdir, "header")
 2390                f = open(header_file_tmp, "w")
 2391                vcf.Writer(f, header_obj)
 2392                f.close()
 2393
 2394                # Replace #CHROM line with rel columns
 2395                header_list = db_for_header.read_header_file(
 2396                    header_file=header_file_tmp
 2397                )
 2398                header_list[-1] = "\t".join(db_header_columns)
 2399
 2400                # Remove CHROM line
 2401                if remove_chrom_line:
 2402                    header_list.pop()
 2403
 2404                # Clean header
 2405                if clean_header:
 2406                    header_list_clean = []
 2407                    for head in header_list:
 2408                        # Clean head for malformed header
 2409                        head_clean = head
 2410                        head_clean = re.subn(
 2411                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2412                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2413                            head_clean,
 2414                            2,
 2415                        )[0]
 2416                        # Write header
 2417                        header_list_clean.append(head_clean)
 2418                    header_list = header_list_clean
 2419
 2420            tmp_header_name = output_file + output_file_ext
 2421
 2422            f = open(tmp_header_name, "w")
 2423            for line in header_list:
 2424                f.write(line)
 2425            f.close()
 2426
 2427        return tmp_header_name
 2428
 2429    def export_variant_vcf(
 2430        self,
 2431        vcf_file,
 2432        remove_info: bool = False,
 2433        add_samples: bool = True,
 2434        list_samples: list = [],
 2435        where_clause: str = "",
 2436        index: bool = False,
 2437        threads: int | None = None,
 2438    ) -> bool | None:
 2439        """
 2440        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2441        remove INFO field, add samples, and control compression and indexing.
 2442
 2443        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2444        written to. It is the output file that will contain the filtered VCF data based on the specified
 2445        parameters
 2446        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2447        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2448        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2449        in, defaults to False
 2450        :type remove_info: bool (optional)
 2451        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2452        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2453        If set to False, the samples will be removed. The default value is True, defaults to True
 2454        :type add_samples: bool (optional)
 2455        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2456        in the output VCF file. By default, all samples will be included. If you provide a list of
 2457        samples, only those samples will be included in the output file
 2458        :type list_samples: list
 2459        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2460        determines whether or not to create an index for the output VCF file. If `index` is set to
 2461        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2462        :type index: bool (optional)
 2463        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2464        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2465        will be used during the export process. More threads can potentially speed up the export process
 2466        by utilizing multiple cores of the processor. If
 2467        :type threads: int | None
 2468        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2469        method with various parameters including the output file, query, threads, sort flag, and index
 2470        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2471        specified parameters and configurations provided in the `export_variant_vcf` function.
 2472        """
 2473
 2474        # Config
 2475        config = self.get_config()
 2476
 2477        # Extract VCF
 2478        log.debug("Export VCF...")
 2479
 2480        # Table variants
 2481        table_variants = self.get_table_variants()
 2482
 2483        # Threads
 2484        if not threads:
 2485            threads = self.get_threads()
 2486
 2487        # Info fields
 2488        if remove_info:
 2489            if not isinstance(remove_info, str):
 2490                remove_info = "."
 2491            info_field = f"""'{remove_info}' as INFO"""
 2492        else:
 2493            info_field = "INFO"
 2494
 2495        # Samples fields
 2496        if add_samples:
 2497            if not list_samples:
 2498                list_samples = self.get_header_sample_list()
 2499            if list_samples:
 2500                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2501            else:
 2502                samples_fields = ""
 2503            log.debug(f"samples_fields: {samples_fields}")
 2504        else:
 2505            samples_fields = ""
 2506
 2507        # Where clause
 2508        if where_clause is None:
 2509            where_clause = ""
 2510
 2511        # Variants
 2512        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2513        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2514        log.debug(f"sql_query_select={sql_query_select}")
 2515
 2516        return self.export_output(
 2517            output_file=vcf_file,
 2518            output_header=None,
 2519            export_header=True,
 2520            query=sql_query_select,
 2521            parquet_partitions=None,
 2522            chunk_size=config.get("chunk_size", None),
 2523            threads=threads,
 2524            sort=True,
 2525            index=index,
 2526            order_by=None,
 2527        )
 2528
 2529    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2530        """
 2531        It takes a list of commands and runs them in parallel using the number of threads specified
 2532
 2533        :param commands: A list of commands to run
 2534        :param threads: The number of threads to use, defaults to 1 (optional)
 2535        """
 2536
 2537        run_parallel_commands(commands, threads)
 2538
 2539    def get_threads(self, default: int = 1) -> int:
 2540        """
 2541        This function returns the number of threads to use for a job, with a default value of 1 if not
 2542        specified.
 2543
 2544        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2545        default number of threads to use if no specific value is provided. If no value is provided for
 2546        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2547        used, defaults to 1
 2548        :type default: int (optional)
 2549        :return: the number of threads to use for the current job.
 2550        """
 2551
 2552        # Config
 2553        config = self.get_config()
 2554
 2555        # Param
 2556        param = self.get_param()
 2557
 2558        # Input threads
 2559        input_thread = param.get("threads", config.get("threads", None))
 2560
 2561        # Check threads
 2562        if not input_thread:
 2563            threads = default
 2564        elif int(input_thread) <= 0:
 2565            threads = os.cpu_count()
 2566        else:
 2567            threads = int(input_thread)
 2568        return threads
 2569
 2570    def get_memory(self, default: str = None) -> str:
 2571        """
 2572        This function retrieves the memory value from parameters or configuration with a default value
 2573        if not found.
 2574
 2575        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2576        default value is used as a fallback in case the `memory` parameter is not provided in the
 2577        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2578        the function
 2579        :type default: str
 2580        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2581        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2582        return the default value provided as an argument to the function.
 2583        """
 2584
 2585        # Config
 2586        config = self.get_config()
 2587
 2588        # Param
 2589        param = self.get_param()
 2590
 2591        # Input threads
 2592        input_memory = param.get("memory", config.get("memory", None))
 2593
 2594        # Check threads
 2595        if input_memory:
 2596            memory = input_memory
 2597        else:
 2598            memory = default
 2599
 2600        return memory
 2601
 2602    def update_from_vcf(self, vcf_file: str) -> None:
 2603        """
 2604        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2605
 2606        :param vcf_file: the path to the VCF file
 2607        """
 2608
 2609        connexion_format = self.get_connexion_format()
 2610
 2611        if connexion_format in ["duckdb"]:
 2612            self.update_from_vcf_duckdb(vcf_file)
 2613        elif connexion_format in ["sqlite"]:
 2614            self.update_from_vcf_sqlite(vcf_file)
 2615
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (DuckDB).

        The VCF is loaded into a pandas DataFrame, and a single UPDATE appends
        each matching VCF record's INFO to the existing INFO (separated by ';'
        when both sides are non-empty), matching rows on #CHROM/POS/REF/ALT.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping all '##' header lines
        # so that the '#CHROM' line becomes the column header (header=0)
        skip = self.get_header_length(file=vcf_file)
        # NOTE: vcf_df looks unused, but DuckDB resolves it by name from the
        # local scope inside the SQL below (replacement scan) — do not remove
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the VCF INFO to the current INFO:
        # - keep the existing INFO only if it is neither '' nor '.'
        # - add a ';' separator only when both sides carry real content
        # - rows with no matching, non-empty VCF INFO are left with the
        #   (possibly emptied) existing INFO
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2671
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (SQLite).

        A temporary table is created with the same schema as `variants`, the
        VCF body is loaded into it, then a single UPDATE appends each matching
        record's INFO to the existing INFO (separated by ';' when both sides
        are non-empty), matching rows on #CHROM/POS/REF/ALT. The temporary
        table is dropped afterwards.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table for the VCF, cloning the variants schema
        # (WHERE 0 copies the structure without any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body into the temporary table; comment='#' skips every
        # header line including the '#CHROM' line, so columns are set manually
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator (SQLite has no CONCAT function)
        # - keep the existing INFO only if it is neither '' nor '.'
        # - add a ';' separator only when both sides carry real content
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
 2729
 2730    def drop_variants_table(self) -> None:
 2731        """
 2732        > This function drops the variants table
 2733        """
 2734
 2735        table_variants = self.get_table_variants()
 2736        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2737        self.conn.execute(sql_table_variants)
 2738
 2739    def set_variant_id(
 2740        self, variant_id_column: str = "variant_id", force: bool = None
 2741    ) -> str:
 2742        """
 2743        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2744        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2745
 2746        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2747        to variant_id
 2748        :type variant_id_column: str (optional)
 2749        :param force: If True, the variant_id column will be created even if it already exists
 2750        :type force: bool
 2751        :return: The name of the column that contains the variant_id
 2752        """
 2753
 2754        # Assembly
 2755        assembly = self.get_param().get(
 2756            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2757        )
 2758
 2759        # INFO/Tag prefix
 2760        prefix = self.get_explode_infos_prefix()
 2761
 2762        # Explode INFO/SVTYPE
 2763        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2764
 2765        # variants table
 2766        table_variants = self.get_table_variants()
 2767
 2768        # variant_id column
 2769        if not variant_id_column:
 2770            variant_id_column = "variant_id"
 2771
 2772        # Creta variant_id column
 2773        if "variant_id" not in self.get_extra_infos() or force:
 2774
 2775            # Create column
 2776            self.add_column(
 2777                table_name=table_variants,
 2778                column_name=variant_id_column,
 2779                column_type="UBIGINT",
 2780                default_value="0",
 2781            )
 2782
 2783            # Update column
 2784            self.conn.execute(
 2785                f"""
 2786                    UPDATE {table_variants}
 2787                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2788                """
 2789            )
 2790
 2791        # Remove added columns
 2792        for added_column in added_columns:
 2793            self.drop_column(column=added_column)
 2794
 2795        # return variant_id column name
 2796        return variant_id_column
 2797
 2798    def get_variant_id_column(
 2799        self, variant_id_column: str = "variant_id", force: bool = None
 2800    ) -> str:
 2801        """
 2802        This function returns the variant_id column name
 2803
 2804        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2805        defaults to variant_id
 2806        :type variant_id_column: str (optional)
 2807        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2808        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2809        if it is not already set, or if it is set
 2810        :type force: bool
 2811        :return: The variant_id column name.
 2812        """
 2813
 2814        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2815
 2816    ###
 2817    # Annotation
 2818    ###
 2819
 2820    def scan_databases(
 2821        self,
 2822        database_formats: list = ["parquet"],
 2823        database_releases: list = ["current"],
 2824    ) -> dict:
 2825        """
 2826        The function `scan_databases` scans for available databases based on specified formats and
 2827        releases.
 2828
 2829        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2830        of the databases to be scanned. In this case, the accepted format is "parquet"
 2831        :type database_formats: list ["parquet"]
 2832        :param database_releases: The `database_releases` parameter is a list that specifies the
 2833        releases of the databases to be scanned. In the provided function, the default value for
 2834        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2835        databases that are in the "current"
 2836        :type database_releases: list
 2837        :return: The function `scan_databases` returns a dictionary containing information about
 2838        databases that match the specified formats and releases.
 2839        """
 2840
 2841        # Config
 2842        config = self.get_config()
 2843
 2844        # Param
 2845        param = self.get_param()
 2846
 2847        # Param - Assembly
 2848        assembly = param.get("assembly", config.get("assembly", None))
 2849        if not assembly:
 2850            assembly = DEFAULT_ASSEMBLY
 2851            log.warning(f"Default assembly '{assembly}'")
 2852
 2853        # Scan for availabled databases
 2854        log.info(
 2855            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2856        )
 2857        databases_infos_dict = databases_infos(
 2858            database_folder_releases=database_releases,
 2859            database_formats=database_formats,
 2860            assembly=assembly,
 2861            config=config,
 2862        )
 2863        log.info(
 2864            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2865        )
 2866
 2867        return databases_infos_dict
 2868
 2869    def annotation(self) -> None:
 2870        """
 2871        It annotates the VCF file with the annotations specified in the config file.
 2872        """
 2873
 2874        # Config
 2875        config = self.get_config()
 2876
 2877        # Param
 2878        param = self.get_param()
 2879
 2880        # Param - Assembly
 2881        assembly = param.get("assembly", config.get("assembly", None))
 2882        if not assembly:
 2883            assembly = DEFAULT_ASSEMBLY
 2884            log.warning(f"Default assembly '{assembly}'")
 2885
 2886        # annotations databases folders
 2887        annotations_databases = set(
 2888            config.get("folders", {})
 2889            .get("databases", {})
 2890            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2891            + config.get("folders", {})
 2892            .get("databases", {})
 2893            .get("parquet", ["~/howard/databases/parquet/current"])
 2894            + config.get("folders", {})
 2895            .get("databases", {})
 2896            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2897        )
 2898
 2899        # Get param annotations
 2900        if param.get("annotations", None) and isinstance(
 2901            param.get("annotations", None), str
 2902        ):
 2903            log.debug(param.get("annotations", None))
 2904            param_annotation_list = param.get("annotations").split(",")
 2905        else:
 2906            param_annotation_list = []
 2907
 2908        # Each tools param
 2909        if param.get("annotation_parquet", None) != None:
 2910            log.debug(
 2911                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2912            )
 2913            if isinstance(param.get("annotation_parquet", None), list):
 2914                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2915            else:
 2916                param_annotation_list.append(param.get("annotation_parquet"))
 2917        if param.get("annotation_snpsift", None) != None:
 2918            if isinstance(param.get("annotation_snpsift", None), list):
 2919                param_annotation_list.append(
 2920                    "snpsift:"
 2921                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2922                )
 2923            else:
 2924                param_annotation_list.append(
 2925                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2926                )
 2927        if param.get("annotation_snpeff", None) != None:
 2928            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2929        if param.get("annotation_bcftools", None) != None:
 2930            if isinstance(param.get("annotation_bcftools", None), list):
 2931                param_annotation_list.append(
 2932                    "bcftools:"
 2933                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2934                )
 2935            else:
 2936                param_annotation_list.append(
 2937                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2938                )
 2939        if param.get("annotation_annovar", None) != None:
 2940            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2941        if param.get("annotation_exomiser", None) != None:
 2942            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2943        if param.get("annotation_splice", None) != None:
 2944            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2945
 2946        # Merge param annotations list
 2947        param["annotations"] = ",".join(param_annotation_list)
 2948
 2949        # debug
 2950        log.debug(f"param_annotations={param['annotations']}")
 2951
 2952        if param.get("annotations"):
 2953
 2954            # Log
 2955            # log.info("Annotations - Check annotation parameters")
 2956
 2957            if not "annotation" in param:
 2958                param["annotation"] = {}
 2959
 2960            # List of annotations parameters
 2961            annotations_list_input = {}
 2962            if isinstance(param.get("annotations", None), str):
 2963                annotation_file_list = [
 2964                    value for value in param.get("annotations", "").split(",")
 2965                ]
 2966                for annotation_file in annotation_file_list:
 2967                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
 2968            else:
 2969                annotations_list_input = param.get("annotations", {})
 2970
 2971            log.info(f"Quick Annotations:")
 2972            for annotation_key in list(annotations_list_input.keys()):
 2973                log.info(f"   {annotation_key}")
 2974
 2975            # List of annotations and associated fields
 2976            annotations_list = {}
 2977
 2978            for annotation_file in annotations_list_input:
 2979
 2980                # Explode annotations if ALL
 2981                if (
 2982                    annotation_file.upper() == "ALL"
 2983                    or annotation_file.upper().startswith("ALL:")
 2984                ):
 2985
 2986                    # check ALL parameters (formats, releases)
 2987                    annotation_file_split = annotation_file.split(":")
 2988                    database_formats = "parquet"
 2989                    database_releases = "current"
 2990                    for annotation_file_option in annotation_file_split[1:]:
 2991                        database_all_options_split = annotation_file_option.split("=")
 2992                        if database_all_options_split[0] == "format":
 2993                            database_formats = database_all_options_split[1].split("+")
 2994                        if database_all_options_split[0] == "release":
 2995                            database_releases = database_all_options_split[1].split("+")
 2996
 2997                    # Scan for availabled databases
 2998                    databases_infos_dict = self.scan_databases(
 2999                        database_formats=database_formats,
 3000                        database_releases=database_releases,
 3001                    )
 3002
 3003                    # Add found databases in annotation parameters
 3004                    for database_infos in databases_infos_dict.keys():
 3005                        annotations_list[database_infos] = {"INFO": None}
 3006
 3007                else:
 3008                    annotations_list[annotation_file] = annotations_list_input[
 3009                        annotation_file
 3010                    ]
 3011
 3012            # Check each databases
 3013            if len(annotations_list):
 3014
 3015                log.info(
 3016                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 3017                )
 3018
 3019                for annotation_file in annotations_list:
 3020
 3021                    # Init
 3022                    annotations = annotations_list.get(annotation_file, None)
 3023
 3024                    # Annotation snpEff
 3025                    if annotation_file.startswith("snpeff"):
 3026
 3027                        log.debug(f"Quick Annotation snpEff")
 3028
 3029                        if "snpeff" not in param["annotation"]:
 3030                            param["annotation"]["snpeff"] = {}
 3031
 3032                        if "options" not in param["annotation"]["snpeff"]:
 3033                            param["annotation"]["snpeff"]["options"] = ""
 3034
 3035                        # snpEff options in annotations
 3036                        param["annotation"]["snpeff"]["options"] = "".join(
 3037                            annotation_file.split(":")[1:]
 3038                        )
 3039
 3040                    # Annotation Annovar
 3041                    elif annotation_file.startswith("annovar"):
 3042
 3043                        log.debug(f"Quick Annotation Annovar")
 3044
 3045                        if "annovar" not in param["annotation"]:
 3046                            param["annotation"]["annovar"] = {}
 3047
 3048                        if "annotations" not in param["annotation"]["annovar"]:
 3049                            param["annotation"]["annovar"]["annotations"] = {}
 3050
 3051                        # Options
 3052                        annotation_file_split = annotation_file.split(":")
 3053                        for annotation_file_annotation in annotation_file_split[1:]:
 3054                            if annotation_file_annotation:
 3055                                param["annotation"]["annovar"]["annotations"][
 3056                                    annotation_file_annotation
 3057                                ] = annotations
 3058
 3059                    # Annotation Exomiser
 3060                    elif annotation_file.startswith("exomiser"):
 3061
 3062                        log.debug(f"Quick Annotation Exomiser")
 3063
 3064                        param["annotation"]["exomiser"] = params_string_to_dict(
 3065                            annotation_file
 3066                        )
 3067
 3068                    # Annotation Splice
 3069                    elif annotation_file.startswith("splice"):
 3070
 3071                        log.debug(f"Quick Annotation Splice")
 3072
 3073                        param["annotation"]["splice"] = params_string_to_dict(
 3074                            annotation_file
 3075                        )
 3076
 3077                    # Annotation Parquet or BCFTOOLS
 3078                    else:
 3079
 3080                        # Tools detection
 3081                        if annotation_file.startswith("bcftools:"):
 3082                            annotation_tool_initial = "bcftools"
 3083                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3084                        elif annotation_file.startswith("snpsift:"):
 3085                            annotation_tool_initial = "snpsift"
 3086                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3087                        elif annotation_file.startswith("bigwig:"):
 3088                            annotation_tool_initial = "bigwig"
 3089                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3090                        else:
 3091                            annotation_tool_initial = None
 3092
 3093                        # list of files
 3094                        annotation_file_list = annotation_file.replace("+", ":").split(
 3095                            ":"
 3096                        )
 3097
 3098                        for annotation_file in annotation_file_list:
 3099
 3100                            if annotation_file:
 3101
 3102                                # Annotation tool initial
 3103                                annotation_tool = annotation_tool_initial
 3104
 3105                                # Find file
 3106                                annotation_file_found = None
 3107
 3108                                if os.path.exists(annotation_file):
 3109                                    annotation_file_found = annotation_file
 3110                                elif os.path.exists(full_path(annotation_file)):
 3111                                    annotation_file_found = full_path(annotation_file)
 3112                                else:
 3113                                    # Find within assembly folders
 3114                                    for annotations_database in annotations_databases:
 3115                                        found_files = find_all(
 3116                                            annotation_file,
 3117                                            os.path.join(
 3118                                                annotations_database, assembly
 3119                                            ),
 3120                                        )
 3121                                        if len(found_files) > 0:
 3122                                            annotation_file_found = found_files[0]
 3123                                            break
 3124                                    if not annotation_file_found and not assembly:
 3125                                        # Find within folders
 3126                                        for (
 3127                                            annotations_database
 3128                                        ) in annotations_databases:
 3129                                            found_files = find_all(
 3130                                                annotation_file, annotations_database
 3131                                            )
 3132                                            if len(found_files) > 0:
 3133                                                annotation_file_found = found_files[0]
 3134                                                break
 3135                                log.debug(
 3136                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 3137                                )
 3138
 3139                                # Full path
 3140                                annotation_file_found = full_path(annotation_file_found)
 3141
 3142                                if annotation_file_found:
 3143
 3144                                    database = Database(database=annotation_file_found)
 3145                                    quick_annotation_format = database.get_format()
 3146                                    quick_annotation_is_compressed = (
 3147                                        database.is_compressed()
 3148                                    )
 3149                                    quick_annotation_is_indexed = os.path.exists(
 3150                                        f"{annotation_file_found}.tbi"
 3151                                    )
 3152                                    bcftools_preference = False
 3153
 3154                                    # Check Annotation Tool
 3155                                    if not annotation_tool:
 3156                                        if (
 3157                                            bcftools_preference
 3158                                            and quick_annotation_format
 3159                                            in ["vcf", "bed"]
 3160                                            and quick_annotation_is_compressed
 3161                                            and quick_annotation_is_indexed
 3162                                        ):
 3163                                            annotation_tool = "bcftools"
 3164                                        elif quick_annotation_format in [
 3165                                            "vcf",
 3166                                            "bed",
 3167                                            "tsv",
 3168                                            "tsv",
 3169                                            "csv",
 3170                                            "json",
 3171                                            "tbl",
 3172                                            "parquet",
 3173                                            "duckdb",
 3174                                        ]:
 3175                                            annotation_tool = "parquet"
 3176                                        elif quick_annotation_format in ["bw"]:
 3177                                            annotation_tool = "bigwig"
 3178                                        else:
 3179                                            log.error(
 3180                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3181                                            )
 3182                                            raise ValueError(
 3183                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3184                                            )
 3185
 3186                                    log.debug(
 3187                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3188                                    )
 3189
 3190                                    # Annotation Tool dispatch
 3191                                    if annotation_tool:
 3192                                        if annotation_tool not in param["annotation"]:
 3193                                            param["annotation"][annotation_tool] = {}
 3194                                        if (
 3195                                            "annotations"
 3196                                            not in param["annotation"][annotation_tool]
 3197                                        ):
 3198                                            param["annotation"][annotation_tool][
 3199                                                "annotations"
 3200                                            ] = {}
 3201                                        param["annotation"][annotation_tool][
 3202                                            "annotations"
 3203                                        ][annotation_file_found] = annotations
 3204
 3205                                else:
 3206                                    log.warning(
 3207                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3208                                    )
 3209
 3210                self.set_param(param)
 3211
 3212        if param.get("annotation", None):
 3213            log.info("Annotations")
 3214            if param.get("annotation", {}).get("parquet", None):
 3215                log.info("Annotations 'parquet'...")
 3216                self.annotation_parquet()
 3217            if param.get("annotation", {}).get("bcftools", None):
 3218                log.info("Annotations 'bcftools'...")
 3219                self.annotation_bcftools()
 3220            if param.get("annotation", {}).get("snpsift", None):
 3221                log.info("Annotations 'snpsift'...")
 3222                self.annotation_snpsift()
 3223            if param.get("annotation", {}).get("bigwig", None):
 3224                log.info("Annotations 'bigwig'...")
 3225                self.annotation_bigwig()
 3226            if param.get("annotation", {}).get("annovar", None):
 3227                log.info("Annotations 'annovar'...")
 3228                self.annotation_annovar()
 3229            if param.get("annotation", {}).get("snpeff", None):
 3230                log.info("Annotations 'snpeff'...")
 3231                self.annotation_snpeff()
 3232            if param.get("annotation", {}).get("exomiser", None) is not None:
 3233                log.info("Annotations 'exomiser'...")
 3234                self.annotation_exomiser()
 3235            if param.get("annotation", {}).get("splice", None) is not None:
 3236                log.info("Annotations 'splice' ...")
 3237                self.annotation_splice()
 3238
 3239        # Explode INFOS fields into table fields
 3240        if self.get_explode_infos():
 3241            self.explode_infos(
 3242                prefix=self.get_explode_infos_prefix(),
 3243                fields=self.get_explode_infos_fields(),
 3244                force=True,
 3245            )
 3246
 3247    def annotation_bigwig(self, threads: int = None) -> None:
 3248        """
 3249        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3250
 3251        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3252        number of threads to be used for parallel processing during the annotation process. If the
 3253        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3254        threads to use based on the system configuration
 3255        :type threads: int
 3256        :return: True
 3257        """
 3258
 3259        # DEBUG
 3260        log.debug("Start annotation with bigwig databases")
 3261
 3262        # # Threads
 3263        # if not threads:
 3264        #     threads = self.get_threads()
 3265        # log.debug("Threads: " + str(threads))
 3266
 3267        # Config
 3268        config = self.get_config()
 3269        log.debug("Config: " + str(config))
 3270
 3271        # Config - BCFTools databases folders
 3272        databases_folders = set(
 3273            self.get_config()
 3274            .get("folders", {})
 3275            .get("databases", {})
 3276            .get("annotations", ["."])
 3277            + self.get_config()
 3278            .get("folders", {})
 3279            .get("databases", {})
 3280            .get("bigwig", ["."])
 3281        )
 3282        log.debug("Databases annotations: " + str(databases_folders))
 3283
 3284        # Param
 3285        annotations = (
 3286            self.get_param()
 3287            .get("annotation", {})
 3288            .get("bigwig", {})
 3289            .get("annotations", None)
 3290        )
 3291        log.debug("Annotations: " + str(annotations))
 3292
 3293        # Assembly
 3294        assembly = self.get_param().get(
 3295            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3296        )
 3297
 3298        # Data
 3299        table_variants = self.get_table_variants()
 3300
 3301        # Check if not empty
 3302        log.debug("Check if not empty")
 3303        sql_query_chromosomes = (
 3304            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3305        )
 3306        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3307        if not sql_query_chromosomes_df["count"][0]:
 3308            log.info(f"VCF empty")
 3309            return
 3310
 3311        # VCF header
 3312        vcf_reader = self.get_header()
 3313        log.debug("Initial header: " + str(vcf_reader.infos))
 3314
 3315        # Existing annotations
 3316        for vcf_annotation in self.get_header().infos:
 3317
 3318            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3319            log.debug(
 3320                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3321            )
 3322
 3323        if annotations:
 3324
 3325            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3326
 3327                # Export VCF file
 3328                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3329
 3330                # annotation_bigwig_config
 3331                annotation_bigwig_config_list = []
 3332
 3333                for annotation in annotations:
 3334                    annotation_fields = annotations[annotation]
 3335
 3336                    # Annotation Name
 3337                    annotation_name = os.path.basename(annotation)
 3338
 3339                    if not annotation_fields:
 3340                        annotation_fields = {"INFO": None}
 3341
 3342                    log.debug(f"Annotation '{annotation_name}'")
 3343                    log.debug(
 3344                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3345                    )
 3346
 3347                    # Create Database
 3348                    database = Database(
 3349                        database=annotation,
 3350                        databases_folders=databases_folders,
 3351                        assembly=assembly,
 3352                    )
 3353
 3354                    # Find files
 3355                    db_file = database.get_database()
 3356                    db_file = full_path(db_file)
 3357                    db_hdr_file = database.get_header_file()
 3358                    db_hdr_file = full_path(db_hdr_file)
 3359                    db_file_type = database.get_format()
 3360
 3361                    # If db_file is http ?
 3362                    if database.get_database().startswith("http"):
 3363
 3364                        # Datbase is HTTP URL
 3365                        db_file_is_http = True
 3366
 3367                        # DB file keep as URL
 3368                        db_file = database.get_database()
 3369                        log.warning(
 3370                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
 3371                        )
 3372
 3373                        # Retrieve automatic annotation field name
 3374                        annotation_field = clean_annotation_field(
 3375                            os.path.basename(db_file).replace(".bw", "")
 3376                        )
 3377                        log.debug(
 3378                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
 3379                        )
 3380
 3381                        # Create automatic header file
 3382                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3383                        with open(db_hdr_file, "w") as f:
 3384                            f.write("##fileformat=VCFv4.2\n")
 3385                            f.write(
 3386                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
 3387                            )
 3388                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3389
 3390                    else:
 3391
 3392                        # Datbase is NOT HTTP URL
 3393                        db_file_is_http = False
 3394
 3395                    # Check index - try to create if not exists
 3396                    if (
 3397                        db_file is None
 3398                        or db_hdr_file is None
 3399                        or (not os.path.exists(db_file) and not db_file_is_http)
 3400                        or not os.path.exists(db_hdr_file)
 3401                        or not db_file_type in ["bw"]
 3402                    ):
 3403                        # if False:
 3404                        log.error("Annotation failed: database not valid")
 3405                        log.error(f"Annotation annotation file: {db_file}")
 3406                        log.error(f"Annotation annotation file type: {db_file_type}")
 3407                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3408                        raise ValueError(
 3409                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3410                        )
 3411                    else:
 3412
 3413                        # Log
 3414                        log.debug(
 3415                            f"Annotation '{annotation}' - file: "
 3416                            + str(db_file)
 3417                            + " and "
 3418                            + str(db_hdr_file)
 3419                        )
 3420
 3421                        # Load header as VCF object
 3422                        db_hdr_vcf = Variants(input=db_hdr_file)
 3423                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3424                        log.debug(
 3425                            "Annotation database header: "
 3426                            + str(db_hdr_vcf_header_infos)
 3427                        )
 3428
 3429                        # For all fields in database
 3430                        annotation_fields_full = False
 3431                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3432                            annotation_fields = {
 3433                                key: key for key in db_hdr_vcf_header_infos
 3434                            }
 3435                            log.debug(
 3436                                "Annotation database header - All annotations added: "
 3437                                + str(annotation_fields)
 3438                            )
 3439                            annotation_fields_full = True
 3440
 3441                        # Init
 3442                        cyvcf2_header_rename_dict = {}
 3443                        cyvcf2_header_list = []
 3444                        cyvcf2_header_indexes = {}
 3445
 3446                        # process annotation fields
 3447                        for annotation_field in annotation_fields:
 3448
 3449                            # New annotation name
 3450                            annotation_field_new = annotation_fields[annotation_field]
 3451
 3452                            # Check annotation field and index in header
 3453                            if (
 3454                                annotation_field
 3455                                in db_hdr_vcf.get_header_columns_as_list()
 3456                            ):
 3457                                annotation_field_index = (
 3458                                    db_hdr_vcf.get_header_columns_as_list().index(
 3459                                        annotation_field
 3460                                    )
 3461                                    - 3
 3462                                )
 3463                                cyvcf2_header_indexes[annotation_field_new] = (
 3464                                    annotation_field_index
 3465                                )
 3466                            else:
 3467                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3468                                log.error(msg_err)
 3469                                raise ValueError(msg_err)
 3470
 3471                            # Append annotation field in cyvcf2 header list
 3472                            cyvcf2_header_rename_dict[annotation_field_new] = (
 3473                                db_hdr_vcf_header_infos[annotation_field].id
 3474                            )
 3475                            cyvcf2_header_list.append(
 3476                                {
 3477                                    "ID": annotation_field_new,
 3478                                    "Number": db_hdr_vcf_header_infos[
 3479                                        annotation_field
 3480                                    ].num,
 3481                                    "Type": db_hdr_vcf_header_infos[
 3482                                        annotation_field
 3483                                    ].type,
 3484                                    "Description": db_hdr_vcf_header_infos[
 3485                                        annotation_field
 3486                                    ].desc,
 3487                                }
 3488                            )
 3489
 3490                            # Add header on VCF
 3491                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
 3492                                annotation_field_new,
 3493                                db_hdr_vcf_header_infos[annotation_field].num,
 3494                                db_hdr_vcf_header_infos[annotation_field].type,
 3495                                db_hdr_vcf_header_infos[annotation_field].desc,
 3496                                "HOWARD BigWig annotation",
 3497                                "unknown",
 3498                                self.code_type_map[
 3499                                    db_hdr_vcf_header_infos[annotation_field].type
 3500                                ],
 3501                            )
 3502
 3503                        # Load bigwig database
 3504                        bw_db = pyBigWig.open(db_file)
 3505                        if bw_db.isBigWig():
 3506                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3507                        else:
 3508                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3509                            log.error(msg_err)
 3510                            raise ValueError(msg_err)
 3511
 3512                        annotation_bigwig_config_list.append(
 3513                            {
 3514                                "db_file": db_file,
 3515                                "bw_db": bw_db,
 3516                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3517                                "cyvcf2_header_list": cyvcf2_header_list,
 3518                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
 3519                            }
 3520                        )
 3521
 3522                # Annotate
 3523                if annotation_bigwig_config_list:
 3524
 3525                    # Annotation config
 3526                    log.debug(
 3527                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
 3528                    )
 3529
 3530                    # Export VCF file
 3531                    self.export_variant_vcf(
 3532                        vcf_file=tmp_vcf_name,
 3533                        remove_info=True,
 3534                        add_samples=False,
 3535                        index=True,
 3536                    )
 3537
 3538                    # Load input tmp file
 3539                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3540
 3541                    # Add header in input file
 3542                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3543                        for cyvcf2_header_field in annotation_bigwig_config.get(
 3544                            "cyvcf2_header_list", []
 3545                        ):
 3546                            log.info(
 3547                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
 3548                            )
 3549                            input_vcf.add_info_to_header(cyvcf2_header_field)
 3550
 3551                    # Create output VCF file
 3552                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
 3553                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3554
 3555                    # Fetch variants
 3556                    log.info(f"Annotations 'bigwig' start...")
 3557                    for variant in input_vcf:
 3558
 3559                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3560
 3561                            # DB and indexes
 3562                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3563                            cyvcf2_header_indexes = annotation_bigwig_config.get(
 3564                                "cyvcf2_header_indexes", None
 3565                            )
 3566
 3567                            # Retrieve value from chrom pos
 3568                            res = bw_db.values(
 3569                                variant.CHROM, variant.POS - 1, variant.POS
 3570                            )
 3571
 3572                            # For each annotation fields (and indexes)
 3573                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3574
 3575                                # If value is NOT nNone
 3576                                if not np.isnan(
 3577                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3578                                ):
 3579                                    variant.INFO[cyvcf2_header_index] = res[
 3580                                        cyvcf2_header_indexes[cyvcf2_header_index]
 3581                                    ]
 3582
 3583                        # Add record in output file
 3584                        output_vcf.write_record(variant)
 3585
 3586                    # Log
 3587                    log.debug(f"Annotation done.")
 3588
 3589                    # Close and write file
 3590                    log.info(f"Annotations 'bigwig' write...")
 3591                    output_vcf.close()
 3592                    log.debug(f"Write done.")
 3593
 3594                    # Update variants
 3595                    log.info(f"Annotations 'bigwig' update...")
 3596                    self.update_from_vcf(output_vcf_file)
 3597                    log.debug(f"Update done.")
 3598
 3599        return True
 3600
 3601    def annotation_snpsift(self, threads: int = None) -> None:
 3602        """
 3603        This function annotate with bcftools
 3604
 3605        :param threads: Number of threads to use
 3606        :return: the value of the variable "return_value".
 3607        """
 3608
 3609        # DEBUG
 3610        log.debug("Start annotation with bcftools databases")
 3611
 3612        # Threads
 3613        if not threads:
 3614            threads = self.get_threads()
 3615        log.debug("Threads: " + str(threads))
 3616
 3617        # Config
 3618        config = self.get_config()
 3619        log.debug("Config: " + str(config))
 3620
 3621        # Config - snpSift
 3622        snpsift_bin_command = get_bin_command(
 3623            bin="SnpSift.jar",
 3624            tool="snpsift",
 3625            bin_type="jar",
 3626            config=config,
 3627            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3628        )
 3629        if not snpsift_bin_command:
 3630            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3631            log.error(msg_err)
 3632            raise ValueError(msg_err)
 3633
 3634        # Config - bcftools
 3635        bcftools_bin_command = get_bin_command(
 3636            bin="bcftools",
 3637            tool="bcftools",
 3638            bin_type="bin",
 3639            config=config,
 3640            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3641        )
 3642        if not bcftools_bin_command:
 3643            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3644            log.error(msg_err)
 3645            raise ValueError(msg_err)
 3646
 3647        # Config - BCFTools databases folders
 3648        databases_folders = set(
 3649            self.get_config()
 3650            .get("folders", {})
 3651            .get("databases", {})
 3652            .get("annotations", ["."])
 3653            + self.get_config()
 3654            .get("folders", {})
 3655            .get("databases", {})
 3656            .get("bcftools", ["."])
 3657        )
 3658        log.debug("Databases annotations: " + str(databases_folders))
 3659
 3660        # Param
 3661        annotations = (
 3662            self.get_param()
 3663            .get("annotation", {})
 3664            .get("snpsift", {})
 3665            .get("annotations", None)
 3666        )
 3667        log.debug("Annotations: " + str(annotations))
 3668
 3669        # Assembly
 3670        assembly = self.get_param().get(
 3671            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3672        )
 3673
 3674        # Data
 3675        table_variants = self.get_table_variants()
 3676
 3677        # Check if not empty
 3678        log.debug("Check if not empty")
 3679        sql_query_chromosomes = (
 3680            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3681        )
 3682        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3683        if not sql_query_chromosomes_df["count"][0]:
 3684            log.info(f"VCF empty")
 3685            return
 3686
 3687        # VCF header
 3688        vcf_reader = self.get_header()
 3689        log.debug("Initial header: " + str(vcf_reader.infos))
 3690
 3691        # Existing annotations
 3692        for vcf_annotation in self.get_header().infos:
 3693
 3694            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3695            log.debug(
 3696                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3697            )
 3698
 3699        if annotations:
 3700
 3701            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3702
 3703                # Export VCF file
 3704                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3705
 3706                # Init
 3707                commands = {}
 3708
 3709                for annotation in annotations:
 3710                    annotation_fields = annotations[annotation]
 3711
 3712                    # Annotation Name
 3713                    annotation_name = os.path.basename(annotation)
 3714
 3715                    if not annotation_fields:
 3716                        annotation_fields = {"INFO": None}
 3717
 3718                    log.debug(f"Annotation '{annotation_name}'")
 3719                    log.debug(
 3720                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3721                    )
 3722
 3723                    # Create Database
 3724                    database = Database(
 3725                        database=annotation,
 3726                        databases_folders=databases_folders,
 3727                        assembly=assembly,
 3728                    )
 3729
 3730                    # Find files
 3731                    db_file = database.get_database()
 3732                    db_file = full_path(db_file)
 3733                    db_hdr_file = database.get_header_file()
 3734                    db_hdr_file = full_path(db_hdr_file)
 3735                    db_file_type = database.get_format()
 3736                    db_tbi_file = f"{db_file}.tbi"
 3737                    db_file_compressed = database.is_compressed()
 3738
 3739                    # Check if compressed
 3740                    if not db_file_compressed:
 3741                        log.error(
 3742                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3743                        )
 3744                        raise ValueError(
 3745                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3746                        )
 3747
 3748                    # Check if indexed
 3749                    if not os.path.exists(db_tbi_file):
 3750                        log.error(
 3751                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3752                        )
 3753                        raise ValueError(
 3754                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3755                        )
 3756
 3757                    # Check index - try to create if not exists
 3758                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3759                        log.error("Annotation failed: database not valid")
 3760                        log.error(f"Annotation annotation file: {db_file}")
 3761                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3762                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3763                        raise ValueError(
 3764                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3765                        )
 3766                    else:
 3767
 3768                        log.debug(
 3769                            f"Annotation '{annotation}' - file: "
 3770                            + str(db_file)
 3771                            + " and "
 3772                            + str(db_hdr_file)
 3773                        )
 3774
 3775                        # Load header as VCF object
 3776                        db_hdr_vcf = Variants(input=db_hdr_file)
 3777                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3778                        log.debug(
 3779                            "Annotation database header: "
 3780                            + str(db_hdr_vcf_header_infos)
 3781                        )
 3782
 3783                        # For all fields in database
 3784                        annotation_fields_full = False
 3785                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3786                            annotation_fields = {
 3787                                key: key for key in db_hdr_vcf_header_infos
 3788                            }
 3789                            log.debug(
 3790                                "Annotation database header - All annotations added: "
 3791                                + str(annotation_fields)
 3792                            )
 3793                            annotation_fields_full = True
 3794
 3795                        # # Create file for field rename
 3796                        # log.debug("Create file for field rename")
 3797                        # tmp_rename = NamedTemporaryFile(
 3798                        #     prefix=self.get_prefix(),
 3799                        #     dir=self.get_tmp_dir(),
 3800                        #     suffix=".rename",
 3801                        #     delete=False,
 3802                        # )
 3803                        # tmp_rename_name = tmp_rename.name
 3804                        # tmp_files.append(tmp_rename_name)
 3805
 3806                        # Number of fields
 3807                        nb_annotation_field = 0
 3808                        annotation_list = []
 3809                        annotation_infos_rename_list = []
 3810
 3811                        for annotation_field in annotation_fields:
 3812
 3813                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3814                            annotation_fields_new_name = annotation_fields.get(
 3815                                annotation_field, annotation_field
 3816                            )
 3817                            if not annotation_fields_new_name:
 3818                                annotation_fields_new_name = annotation_field
 3819
 3820                            # Check if field is in DB and if field is not elready in input data
 3821                            if (
 3822                                annotation_field in db_hdr_vcf.get_header().infos
 3823                                and annotation_fields_new_name
 3824                                not in self.get_header().infos
 3825                            ):
 3826
 3827                                log.info(
 3828                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3829                                )
 3830
 3831                                # BCFTools annotate param to rename fields
 3832                                if annotation_field != annotation_fields_new_name:
 3833                                    annotation_infos_rename_list.append(
 3834                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3835                                    )
 3836
 3837                                # Add INFO field to header
 3838                                db_hdr_vcf_header_infos_number = (
 3839                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3840                                )
 3841                                db_hdr_vcf_header_infos_type = (
 3842                                    db_hdr_vcf_header_infos[annotation_field].type
 3843                                    or "String"
 3844                                )
 3845                                db_hdr_vcf_header_infos_description = (
 3846                                    db_hdr_vcf_header_infos[annotation_field].desc
 3847                                    or f"{annotation_field} description"
 3848                                )
 3849                                db_hdr_vcf_header_infos_source = (
 3850                                    db_hdr_vcf_header_infos[annotation_field].source
 3851                                    or "unknown"
 3852                                )
 3853                                db_hdr_vcf_header_infos_version = (
 3854                                    db_hdr_vcf_header_infos[annotation_field].version
 3855                                    or "unknown"
 3856                                )
 3857
 3858                                vcf_reader.infos[annotation_fields_new_name] = (
 3859                                    vcf.parser._Info(
 3860                                        annotation_fields_new_name,
 3861                                        db_hdr_vcf_header_infos_number,
 3862                                        db_hdr_vcf_header_infos_type,
 3863                                        db_hdr_vcf_header_infos_description,
 3864                                        db_hdr_vcf_header_infos_source,
 3865                                        db_hdr_vcf_header_infos_version,
 3866                                        self.code_type_map[
 3867                                            db_hdr_vcf_header_infos_type
 3868                                        ],
 3869                                    )
 3870                                )
 3871
 3872                                annotation_list.append(annotation_field)
 3873
 3874                                nb_annotation_field += 1
 3875
 3876                            else:
 3877
 3878                                if (
 3879                                    annotation_field
 3880                                    not in db_hdr_vcf.get_header().infos
 3881                                ):
 3882                                    log.warning(
 3883                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3884                                    )
 3885                                if (
 3886                                    annotation_fields_new_name
 3887                                    in self.get_header().infos
 3888                                ):
 3889                                    log.warning(
 3890                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3891                                    )
 3892
 3893                        log.info(
 3894                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3895                        )
 3896
 3897                        annotation_infos = ",".join(annotation_list)
 3898
 3899                        if annotation_infos != "":
 3900
 3901                            # Annotated VCF (and error file)
 3902                            tmp_annotation_vcf_name = os.path.join(
 3903                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3904                            )
 3905                            tmp_annotation_vcf_name_err = (
 3906                                tmp_annotation_vcf_name + ".err"
 3907                            )
 3908
 3909                            # Add fields to annotate
 3910                            if not annotation_fields_full:
 3911                                annotation_infos_option = f"-info {annotation_infos}"
 3912                            else:
 3913                                annotation_infos_option = ""
 3914
 3915                            # Info fields rename
 3916                            if annotation_infos_rename_list:
 3917                                annotation_infos_rename = " -c " + ",".join(
 3918                                    annotation_infos_rename_list
 3919                                )
 3920                            else:
 3921                                annotation_infos_rename = ""
 3922
 3923                            # Annotate command
 3924                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3925
 3926                            # Add command
 3927                            commands[command_annotate] = tmp_annotation_vcf_name
 3928
 3929                if commands:
 3930
 3931                    # Export VCF file
 3932                    self.export_variant_vcf(
 3933                        vcf_file=tmp_vcf_name,
 3934                        remove_info=True,
 3935                        add_samples=False,
 3936                        index=True,
 3937                    )
 3938                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3939
 3940                    # Num command
 3941                    nb_command = 0
 3942
 3943                    # Annotate
 3944                    for command_annotate in commands:
 3945                        nb_command += 1
 3946                        log.info(
 3947                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3948                        )
 3949                        log.debug(f"command_annotate={command_annotate}")
 3950                        run_parallel_commands([command_annotate], threads)
 3951
 3952                        # Debug
 3953                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3954
 3955                        # Update variants
 3956                        log.info(
 3957                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3958                        )
 3959                        self.update_from_vcf(commands[command_annotate])
 3960
 3961    def annotation_bcftools(self, threads: int = None) -> None:
 3962        """
 3963        This function annotate with bcftools
 3964
 3965        :param threads: Number of threads to use
 3966        :return: the value of the variable "return_value".
 3967        """
 3968
 3969        # DEBUG
 3970        log.debug("Start annotation with bcftools databases")
 3971
 3972        # Threads
 3973        if not threads:
 3974            threads = self.get_threads()
 3975        log.debug("Threads: " + str(threads))
 3976
 3977        # Config
 3978        config = self.get_config()
 3979        log.debug("Config: " + str(config))
 3980
 3981        # DEBUG
 3982        delete_tmp = True
 3983        if self.get_config().get("verbosity", "warning") in ["debug"]:
 3984            delete_tmp = False
 3985            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 3986
 3987        # Config - BCFTools bin command
 3988        bcftools_bin_command = get_bin_command(
 3989            bin="bcftools",
 3990            tool="bcftools",
 3991            bin_type="bin",
 3992            config=config,
 3993            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3994        )
 3995        if not bcftools_bin_command:
 3996            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3997            log.error(msg_err)
 3998            raise ValueError(msg_err)
 3999
 4000        # Config - BCFTools databases folders
 4001        databases_folders = set(
 4002            self.get_config()
 4003            .get("folders", {})
 4004            .get("databases", {})
 4005            .get("annotations", ["."])
 4006            + self.get_config()
 4007            .get("folders", {})
 4008            .get("databases", {})
 4009            .get("bcftools", ["."])
 4010        )
 4011        log.debug("Databases annotations: " + str(databases_folders))
 4012
 4013        # Param
 4014        annotations = (
 4015            self.get_param()
 4016            .get("annotation", {})
 4017            .get("bcftools", {})
 4018            .get("annotations", None)
 4019        )
 4020        log.debug("Annotations: " + str(annotations))
 4021
 4022        # Assembly
 4023        assembly = self.get_param().get(
 4024            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 4025        )
 4026
 4027        # Data
 4028        table_variants = self.get_table_variants()
 4029
 4030        # Check if not empty
 4031        log.debug("Check if not empty")
 4032        sql_query_chromosomes = (
 4033            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4034        )
 4035        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 4036        if not sql_query_chromosomes_df["count"][0]:
 4037            log.info(f"VCF empty")
 4038            return
 4039
 4040        # Export in VCF
 4041        log.debug("Create initial file to annotate")
 4042        tmp_vcf = NamedTemporaryFile(
 4043            prefix=self.get_prefix(),
 4044            dir=self.get_tmp_dir(),
 4045            suffix=".vcf.gz",
 4046            delete=False,
 4047        )
 4048        tmp_vcf_name = tmp_vcf.name
 4049
 4050        # VCF header
 4051        vcf_reader = self.get_header()
 4052        log.debug("Initial header: " + str(vcf_reader.infos))
 4053
 4054        # Existing annotations
 4055        for vcf_annotation in self.get_header().infos:
 4056
 4057            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4058            log.debug(
 4059                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4060            )
 4061
 4062        if annotations:
 4063
 4064            tmp_ann_vcf_list = []
 4065            commands = []
 4066            tmp_files = []
 4067            err_files = []
 4068
 4069            for annotation in annotations:
 4070                annotation_fields = annotations[annotation]
 4071
 4072                # Annotation Name
 4073                annotation_name = os.path.basename(annotation)
 4074
 4075                if not annotation_fields:
 4076                    annotation_fields = {"INFO": None}
 4077
 4078                log.debug(f"Annotation '{annotation_name}'")
 4079                log.debug(
 4080                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 4081                )
 4082
 4083                # Create Database
 4084                database = Database(
 4085                    database=annotation,
 4086                    databases_folders=databases_folders,
 4087                    assembly=assembly,
 4088                )
 4089
 4090                # Find files
 4091                db_file = database.get_database()
 4092                db_file = full_path(db_file)
 4093                db_hdr_file = database.get_header_file()
 4094                db_hdr_file = full_path(db_hdr_file)
 4095                db_file_type = database.get_format()
 4096                db_tbi_file = f"{db_file}.tbi"
 4097                db_file_compressed = database.is_compressed()
 4098
 4099                # Check if compressed
 4100                if not db_file_compressed:
 4101                    log.error(
 4102                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4103                    )
 4104                    raise ValueError(
 4105                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4106                    )
 4107
 4108                # Check if indexed
 4109                if not os.path.exists(db_tbi_file):
 4110                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 4111                    raise ValueError(
 4112                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 4113                    )
 4114
 4115                # Check index - try to create if not exists
 4116                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 4117                    log.error("Annotation failed: database not valid")
 4118                    log.error(f"Annotation annotation file: {db_file}")
 4119                    log.error(f"Annotation annotation header: {db_hdr_file}")
 4120                    log.error(f"Annotation annotation index: {db_tbi_file}")
 4121                    raise ValueError(
 4122                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 4123                    )
 4124                else:
 4125
 4126                    log.debug(
 4127                        f"Annotation '{annotation}' - file: "
 4128                        + str(db_file)
 4129                        + " and "
 4130                        + str(db_hdr_file)
 4131                    )
 4132
 4133                    # Load header as VCF object
 4134                    db_hdr_vcf = Variants(input=db_hdr_file)
 4135                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 4136                    log.debug(
 4137                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 4138                    )
 4139
 4140                    # For all fields in database
 4141                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 4142                        annotation_fields = {
 4143                            key: key for key in db_hdr_vcf_header_infos
 4144                        }
 4145                        log.debug(
 4146                            "Annotation database header - All annotations added: "
 4147                            + str(annotation_fields)
 4148                        )
 4149
 4150                    # Number of fields
 4151                    nb_annotation_field = 0
 4152                    annotation_list = []
 4153
 4154                    for annotation_field in annotation_fields:
 4155
 4156                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 4157                        annotation_fields_new_name = annotation_fields.get(
 4158                            annotation_field, annotation_field
 4159                        )
 4160                        if not annotation_fields_new_name:
 4161                            annotation_fields_new_name = annotation_field
 4162
 4163                        # Check if field is in DB and if field is not elready in input data
 4164                        if (
 4165                            annotation_field in db_hdr_vcf.get_header().infos
 4166                            and annotation_fields_new_name
 4167                            not in self.get_header().infos
 4168                        ):
 4169
 4170                            log.info(
 4171                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 4172                            )
 4173
 4174                            # Add INFO field to header
 4175                            db_hdr_vcf_header_infos_number = (
 4176                                db_hdr_vcf_header_infos[annotation_field].num or "."
 4177                            )
 4178                            db_hdr_vcf_header_infos_type = (
 4179                                db_hdr_vcf_header_infos[annotation_field].type
 4180                                or "String"
 4181                            )
 4182                            db_hdr_vcf_header_infos_description = (
 4183                                db_hdr_vcf_header_infos[annotation_field].desc
 4184                                or f"{annotation_field} description"
 4185                            )
 4186                            db_hdr_vcf_header_infos_source = (
 4187                                db_hdr_vcf_header_infos[annotation_field].source
 4188                                or "unknown"
 4189                            )
 4190                            db_hdr_vcf_header_infos_version = (
 4191                                db_hdr_vcf_header_infos[annotation_field].version
 4192                                or "unknown"
 4193                            )
 4194
 4195                            vcf_reader.infos[annotation_fields_new_name] = (
 4196                                vcf.parser._Info(
 4197                                    annotation_fields_new_name,
 4198                                    db_hdr_vcf_header_infos_number,
 4199                                    db_hdr_vcf_header_infos_type,
 4200                                    db_hdr_vcf_header_infos_description,
 4201                                    db_hdr_vcf_header_infos_source,
 4202                                    db_hdr_vcf_header_infos_version,
 4203                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 4204                                )
 4205                            )
 4206
 4207                            # annotation_list.append(annotation_field)
 4208                            if annotation_field != annotation_fields_new_name:
 4209                                annotation_list.append(
 4210                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 4211                                )
 4212                            else:
 4213                                annotation_list.append(annotation_field)
 4214
 4215                            nb_annotation_field += 1
 4216
 4217                        else:
 4218
 4219                            if annotation_field not in db_hdr_vcf.get_header().infos:
 4220                                log.warning(
 4221                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 4222                                )
 4223                            if annotation_fields_new_name in self.get_header().infos:
 4224                                log.warning(
 4225                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 4226                                )
 4227
 4228                    log.info(
 4229                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 4230                    )
 4231
 4232                    annotation_infos = ",".join(annotation_list)
 4233
 4234                    if annotation_infos != "":
 4235
 4236                        # Protect header for bcftools (remove "#CHROM" and variants line)
 4237                        log.debug("Protect Header file - remove #CHROM line if exists")
 4238                        tmp_header_vcf = NamedTemporaryFile(
 4239                            prefix=self.get_prefix(),
 4240                            dir=self.get_tmp_dir(),
 4241                            suffix=".hdr",
 4242                            delete=False,
 4243                        )
 4244                        tmp_header_vcf_name = tmp_header_vcf.name
 4245                        tmp_files.append(tmp_header_vcf_name)
 4246                        # Command
 4247                        if db_hdr_file.endswith(".gz"):
 4248                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4249                        else:
 4250                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4251                        # Run
 4252                        run_parallel_commands([command_extract_header], 1)
 4253
 4254                        # Find chomosomes
 4255                        log.debug("Find chromosomes ")
 4256                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 4257                        sql_query_chromosomes_df = self.get_query_to_df(
 4258                            sql_query_chromosomes
 4259                        )
 4260                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 4261
 4262                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 4263
 4264                        # BED columns in the annotation file
 4265                        if db_file_type in ["bed"]:
 4266                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 4267
 4268                        for chrom in chomosomes_list:
 4269
 4270                            # Create BED on initial VCF
 4271                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 4272                            tmp_bed = NamedTemporaryFile(
 4273                                prefix=self.get_prefix(),
 4274                                dir=self.get_tmp_dir(),
 4275                                suffix=".bed",
 4276                                delete=False,
 4277                            )
 4278                            tmp_bed_name = tmp_bed.name
 4279                            tmp_files.append(tmp_bed_name)
 4280
 4281                            # Detecte regions
 4282                            log.debug(
 4283                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 4284                            )
 4285                            window = 1000000
 4286                            sql_query_intervals_for_bed = f"""
 4287                                SELECT  \"#CHROM\",
 4288                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 4289                                        \"POS\"+{window}
 4290                                FROM {table_variants} as table_variants
 4291                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 4292                            """
 4293                            regions = self.conn.execute(
 4294                                sql_query_intervals_for_bed
 4295                            ).fetchall()
 4296                            merged_regions = merge_regions(regions)
 4297                            log.debug(
 4298                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 4299                            )
 4300
 4301                            header = ["#CHROM", "START", "END"]
 4302                            with open(tmp_bed_name, "w") as f:
 4303                                # Write the header with tab delimiter
 4304                                f.write("\t".join(header) + "\n")
 4305                                for d in merged_regions:
 4306                                    # Write each data row with tab delimiter
 4307                                    f.write("\t".join(map(str, d)) + "\n")
 4308
 4309                            # Tmp files
 4310                            tmp_annotation_vcf = NamedTemporaryFile(
 4311                                prefix=self.get_prefix(),
 4312                                dir=self.get_tmp_dir(),
 4313                                suffix=".vcf.gz",
 4314                                delete=False,
 4315                            )
 4316                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 4317                            tmp_files.append(tmp_annotation_vcf_name)
 4318                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 4319                            tmp_annotation_vcf_name_err = (
 4320                                tmp_annotation_vcf_name + ".err"
 4321                            )
 4322                            err_files.append(tmp_annotation_vcf_name_err)
 4323
 4324                            # Annotate Command
 4325                            log.debug(
 4326                                f"Annotation '{annotation}' - add bcftools command"
 4327                            )
 4328
 4329                            # Command
 4330                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 4331
 4332                            # Add command
 4333                            commands.append(command_annotate)
 4334
 4335            # if some commands
 4336            if commands:
 4337
 4338                # Export VCF file
 4339                self.export_variant_vcf(
 4340                    vcf_file=tmp_vcf_name,
 4341                    remove_info=True,
 4342                    add_samples=False,
 4343                    index=True,
 4344                )
 4345
 4346                # Threads
 4347                # calculate threads for annotated commands
 4348                if commands:
 4349                    threads_bcftools_annotate = round(threads / len(commands))
 4350                else:
 4351                    threads_bcftools_annotate = 1
 4352
 4353                if not threads_bcftools_annotate:
 4354                    threads_bcftools_annotate = 1
 4355
 4356                # Add threads option to bcftools commands
 4357                if threads_bcftools_annotate > 1:
 4358                    commands_threaded = []
 4359                    for command in commands:
 4360                        commands_threaded.append(
 4361                            command.replace(
 4362                                f"{bcftools_bin_command} annotate ",
 4363                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 4364                            )
 4365                        )
 4366                    commands = commands_threaded
 4367
 4368                # Command annotation multithreading
 4369                log.debug(f"Annotation - Annotation commands: " + str(commands))
 4370                log.info(
 4371                    f"Annotation - Annotation multithreaded in "
 4372                    + str(len(commands))
 4373                    + " commands"
 4374                )
 4375
 4376                run_parallel_commands(commands, threads)
 4377
 4378                # Merge
 4379                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4380
 4381                if tmp_ann_vcf_list_cmd:
 4382
 4383                    # Tmp file
 4384                    tmp_annotate_vcf = NamedTemporaryFile(
 4385                        prefix=self.get_prefix(),
 4386                        dir=self.get_tmp_dir(),
 4387                        suffix=".vcf.gz",
 4388                        delete=True,
 4389                    )
 4390                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4391                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4392                    err_files.append(tmp_annotate_vcf_name_err)
 4393
 4394                    # Tmp file remove command
 4395                    tmp_files_remove_command = ""
 4396                    if tmp_files:
 4397                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4398
 4399                    # Command merge
 4400                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4401                    log.info(
 4402                        f"Annotation - Annotation merging "
 4403                        + str(len(commands))
 4404                        + " annotated files"
 4405                    )
 4406                    log.debug(f"Annotation - merge command: {merge_command}")
 4407                    run_parallel_commands([merge_command], 1)
 4408
 4409                    # Error messages
 4410                    log.info(f"Error/Warning messages:")
 4411                    error_message_command_all = []
 4412                    error_message_command_warning = []
 4413                    error_message_command_err = []
 4414                    for err_file in err_files:
 4415                        with open(err_file, "r") as f:
 4416                            for line in f:
 4417                                message = line.strip()
 4418                                error_message_command_all.append(message)
 4419                                if line.startswith("[W::"):
 4420                                    error_message_command_warning.append(message)
 4421                                if line.startswith("[E::"):
 4422                                    error_message_command_err.append(
 4423                                        f"{err_file}: " + message
 4424                                    )
 4425                    # log info
 4426                    for message in list(
 4427                        set(error_message_command_err + error_message_command_warning)
 4428                    ):
 4429                        log.info(f"   {message}")
 4430                    # debug info
 4431                    for message in list(set(error_message_command_all)):
 4432                        log.debug(f"   {message}")
 4433                    # failed
 4434                    if len(error_message_command_err):
 4435                        log.error("Annotation failed: Error in commands")
 4436                        raise ValueError("Annotation failed: Error in commands")
 4437
 4438                    # Update variants
 4439                    log.info(f"Annotation - Updating...")
 4440                    self.update_from_vcf(tmp_annotate_vcf_name)
 4441
 4442    def annotation_exomiser(self, threads: int = None) -> None:
 4443        """
 4444        This function annotate with Exomiser
 4445
 4446        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4447        - "analysis" (dict/file):
 4448            Full analysis dictionnary parameters (see Exomiser docs).
 4449            Either a dict, or a file in JSON or YAML format.
 4450            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
 4451            Default : None
 4452        - "preset" (string):
 4453            Analysis preset (available in config folder).
 4454            Used if no full "analysis" is provided.
 4455            Default: "exome"
 4456        - "phenopacket" (dict/file):
 4457            Samples and phenotipic features parameters (see Exomiser docs).
 4458            Either a dict, or a file in JSON or YAML format.
 4459            Default: None
 4460        - "subject" (dict):
 4461            Sample parameters (see Exomiser docs).
 4462            Example:
 4463                "subject":
 4464                    {
 4465                        "id": "ISDBM322017",
 4466                        "sex": "FEMALE"
 4467                    }
 4468            Default: None
 4469        - "sample" (string):
 4470            Sample name to construct "subject" section:
 4471                "subject":
 4472                    {
 4473                        "id": "<sample>",
 4474                        "sex": "UNKNOWN_SEX"
 4475                    }
 4476            Default: None
 4477        - "phenotypicFeatures" (dict)
 4478            Phenotypic features to construct "subject" section.
 4479            Example:
 4480                "phenotypicFeatures":
 4481                    [
 4482                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4483                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4484                    ]
 4485        - "hpo" (list)
 4486            List of HPO ids as phenotypic features.
 4487            Example:
 4488                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4489            Default: []
 4490        - "outputOptions" (dict):
 4491            Output options (see Exomiser docs).
 4492            Default:
 4493                "output_options" =
 4494                    {
 4495                        "outputContributingVariantsOnly": False,
 4496                        "numGenes": 0,
 4497                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4498                    }
 4499        - "transcript_source" (string):
 4500            Transcript source (either "refseq", "ucsc", "ensembl")
 4501            Default: "refseq"
 4502        - "exomiser_to_info" (boolean):
 4503            Add exomiser TSV file columns as INFO fields in VCF.
 4504            Default: False
 4505        - "release" (string):
 4506            Exomise database release.
 4507            If not exists, database release will be downloaded (take a while).
 4508            Default: None (provided by application.properties configuration file)
 4509        - "exomiser_application_properties" (file):
 4510            Exomiser configuration file (see Exomiser docs).
 4511            Useful to automatically download databases (especially for specific genome databases).
 4512
 4513        Notes:
 4514        - If no sample in parameters, first sample in VCF will be chosen
 4515        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4516
 4517        :param threads: The number of threads to use
 4518        :return: None.
 4519        """
 4520
 4521        # DEBUG
 4522        log.debug("Start annotation with Exomiser databases")
 4523
 4524        # Threads
 4525        if not threads:
 4526            threads = self.get_threads()
 4527        log.debug("Threads: " + str(threads))
 4528
 4529        # Config
 4530        config = self.get_config()
 4531        log.debug("Config: " + str(config))
 4532
 4533        # Config - Folders - Databases
 4534        databases_folders = (
 4535            config.get("folders", {})
 4536            .get("databases", {})
 4537            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4538        )
 4539        databases_folders = full_path(databases_folders)
 4540        if not os.path.exists(databases_folders):
 4541            log.error(f"Databases annotations: {databases_folders} NOT found")
 4542        log.debug("Databases annotations: " + str(databases_folders))
 4543
 4544        # Config - Exomiser
 4545        exomiser_bin_command = get_bin_command(
 4546            bin="exomiser-cli*.jar",
 4547            tool="exomiser",
 4548            bin_type="jar",
 4549            config=config,
 4550            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4551        )
 4552        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4553        if not exomiser_bin_command:
 4554            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4555            log.error(msg_err)
 4556            raise ValueError(msg_err)
 4557
 4558        # Param
 4559        param = self.get_param()
 4560        log.debug("Param: " + str(param))
 4561
 4562        # Param - Exomiser
 4563        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4564        log.debug(f"Param Exomiser: {param_exomiser}")
 4565
 4566        # Param - Assembly
 4567        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4568        log.debug("Assembly: " + str(assembly))
 4569
 4570        # Data
 4571        table_variants = self.get_table_variants()
 4572
 4573        # Check if not empty
 4574        log.debug("Check if not empty")
 4575        sql_query_chromosomes = (
 4576            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4577        )
 4578        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4579            log.info(f"VCF empty")
 4580            return False
 4581
 4582        # VCF header
 4583        vcf_reader = self.get_header()
 4584        log.debug("Initial header: " + str(vcf_reader.infos))
 4585
 4586        # Samples
 4587        samples = self.get_header_sample_list()
 4588        if not samples:
 4589            log.error("No Samples in VCF")
 4590            return False
 4591        log.debug(f"Samples: {samples}")
 4592
 4593        # Memory limit
 4594        memory_limit = self.get_memory("8G")
 4595        log.debug(f"memory_limit: {memory_limit}")
 4596
 4597        # Exomiser java options
 4598        exomiser_java_options = (
 4599            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4600        )
 4601        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4602
 4603        # Download Exomiser (if not exists)
 4604        exomiser_release = param_exomiser.get("release", None)
 4605        exomiser_application_properties = param_exomiser.get(
 4606            "exomiser_application_properties", None
 4607        )
 4608        databases_download_exomiser(
 4609            assemblies=[assembly],
 4610            exomiser_folder=databases_folders,
 4611            exomiser_release=exomiser_release,
 4612            exomiser_phenotype_release=exomiser_release,
 4613            exomiser_application_properties=exomiser_application_properties,
 4614        )
 4615
 4616        # Force annotation
 4617        force_update_annotation = True
 4618
 4619        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4620            log.debug("Start annotation Exomiser")
 4621
 4622            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4623
 4624                # tmp_dir = "/tmp/exomiser"
 4625
 4626                ### ANALYSIS ###
 4627                ################
 4628
 4629                # Create analysis.json through analysis dict
 4630                # either analysis in param or by default
 4631                # depending on preset exome/genome)
 4632
 4633                # Init analysis dict
 4634                param_exomiser_analysis_dict = {}
 4635
 4636                # analysis from param
 4637                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4638                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4639
 4640                # If analysis in param -> load anlaysis json
 4641                if param_exomiser_analysis:
 4642
 4643                    # If param analysis is a file and exists
 4644                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4645                        param_exomiser_analysis
 4646                    ):
 4647                        # Load analysis file into analysis dict (either yaml or json)
 4648                        with open(param_exomiser_analysis) as json_file:
 4649                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4650
 4651                    # If param analysis is a dict
 4652                    elif isinstance(param_exomiser_analysis, dict):
 4653                        # Load analysis dict into analysis dict (either yaml or json)
 4654                        param_exomiser_analysis_dict = param_exomiser_analysis
 4655
 4656                    # Error analysis type
 4657                    else:
 4658                        log.error(f"Analysis type unknown. Check param file.")
 4659                        raise ValueError(f"Analysis type unknown. Check param file.")
 4660
 4661                # Case no input analysis config file/dict
 4662                # Use preset (exome/genome) to open default config file
 4663                if not param_exomiser_analysis_dict:
 4664
 4665                    # default preset
 4666                    default_preset = "exome"
 4667
 4668                    # Get param preset or default preset
 4669                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4670
 4671                    # Try to find if preset is a file
 4672                    if os.path.exists(param_exomiser_preset):
 4673                        # Preset file is provided in full path
 4674                        param_exomiser_analysis_default_config_file = (
 4675                            param_exomiser_preset
 4676                        )
 4677                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4678                    #     # Preset file is provided in full path
 4679                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4680                    elif os.path.exists(
 4681                        os.path.join(folder_config, param_exomiser_preset)
 4682                    ):
 4683                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4684                        param_exomiser_analysis_default_config_file = os.path.join(
 4685                            folder_config, param_exomiser_preset
 4686                        )
 4687                    else:
 4688                        # Construct preset file
 4689                        param_exomiser_analysis_default_config_file = os.path.join(
 4690                            folder_config,
 4691                            f"preset-{param_exomiser_preset}-analysis.json",
 4692                        )
 4693
 4694                    # If preset file exists
 4695                    param_exomiser_analysis_default_config_file = full_path(
 4696                        param_exomiser_analysis_default_config_file
 4697                    )
 4698                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4699                        # Load prest file into analysis dict (either yaml or json)
 4700                        with open(
 4701                            param_exomiser_analysis_default_config_file
 4702                        ) as json_file:
 4703                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4704                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4705                                json_file
 4706                            )
 4707
 4708                    # Error preset file
 4709                    else:
 4710                        log.error(
 4711                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4712                        )
 4713                        raise ValueError(
 4714                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4715                        )
 4716
 4717                # If no analysis dict created
 4718                if not param_exomiser_analysis_dict:
 4719                    log.error(f"No analysis config")
 4720                    raise ValueError(f"No analysis config")
 4721
 4722                # Log
 4723                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4724
 4725                ### PHENOPACKET ###
 4726                ###################
 4727
 4728                # If no PhenoPacket in analysis dict -> check in param
 4729                if "phenopacket" not in param_exomiser_analysis_dict:
 4730
 4731                    # If PhenoPacket in param -> load anlaysis json
 4732                    if param_exomiser.get("phenopacket", None):
 4733
 4734                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4735                        param_exomiser_phenopacket = full_path(
 4736                            param_exomiser_phenopacket
 4737                        )
 4738
 4739                        # If param phenopacket is a file and exists
 4740                        if isinstance(
 4741                            param_exomiser_phenopacket, str
 4742                        ) and os.path.exists(param_exomiser_phenopacket):
 4743                            # Load phenopacket file into analysis dict (either yaml or json)
 4744                            with open(param_exomiser_phenopacket) as json_file:
 4745                                param_exomiser_analysis_dict["phenopacket"] = (
 4746                                    yaml.safe_load(json_file)
 4747                                )
 4748
 4749                        # If param phenopacket is a dict
 4750                        elif isinstance(param_exomiser_phenopacket, dict):
 4751                            # Load phenopacket dict into analysis dict (either yaml or json)
 4752                            param_exomiser_analysis_dict["phenopacket"] = (
 4753                                param_exomiser_phenopacket
 4754                            )
 4755
 4756                        # Error phenopacket type
 4757                        else:
 4758                            log.error(f"Phenopacket type unknown. Check param file.")
 4759                            raise ValueError(
 4760                                f"Phenopacket type unknown. Check param file."
 4761                            )
 4762
 4763                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4764                if "phenopacket" not in param_exomiser_analysis_dict:
 4765
 4766                    # Init PhenoPacket
 4767                    param_exomiser_analysis_dict["phenopacket"] = {
 4768                        "id": "analysis",
 4769                        "proband": {},
 4770                    }
 4771
 4772                    ### Add subject ###
 4773
 4774                    # If subject exists
 4775                    param_exomiser_subject = param_exomiser.get("subject", {})
 4776
 4777                    # If subject not exists -> found sample ID
 4778                    if not param_exomiser_subject:
 4779
 4780                        # Found sample ID in param
 4781                        sample = param_exomiser.get("sample", None)
 4782
 4783                        # Find sample ID (first sample)
 4784                        if not sample:
 4785                            sample_list = self.get_header_sample_list()
 4786                            if len(sample_list) > 0:
 4787                                sample = sample_list[0]
 4788                            else:
 4789                                log.error(f"No sample found")
 4790                                raise ValueError(f"No sample found")
 4791
 4792                        # Create subject
 4793                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4794
 4795                    # Add to dict
 4796                    param_exomiser_analysis_dict["phenopacket"][
 4797                        "subject"
 4798                    ] = param_exomiser_subject
 4799
 4800                    ### Add "phenotypicFeatures" ###
 4801
 4802                    # If phenotypicFeatures exists
 4803                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4804                        "phenotypicFeatures", []
 4805                    )
 4806
 4807                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4808                    if not param_exomiser_phenotypicfeatures:
 4809
 4810                        # Found HPO in param
 4811                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4812
 4813                        # Split HPO if list in string format separated by comma
 4814                        if isinstance(param_exomiser_hpo, str):
 4815                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4816
 4817                        # Create HPO list
 4818                        for hpo in param_exomiser_hpo:
 4819                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4820                            param_exomiser_phenotypicfeatures.append(
 4821                                {
 4822                                    "type": {
 4823                                        "id": f"HP:{hpo_clean}",
 4824                                        "label": f"HP:{hpo_clean}",
 4825                                    }
 4826                                }
 4827                            )
 4828
 4829                    # Add to dict
 4830                    param_exomiser_analysis_dict["phenopacket"][
 4831                        "phenotypicFeatures"
 4832                    ] = param_exomiser_phenotypicfeatures
 4833
 4834                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4835                    if not param_exomiser_phenotypicfeatures:
 4836                        for step in param_exomiser_analysis_dict.get(
 4837                            "analysis", {}
 4838                        ).get("steps", []):
 4839                            if "hiPhivePrioritiser" in step:
 4840                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4841                                    "steps", []
 4842                                ).remove(step)
 4843
 4844                ### Add Input File ###
 4845
 4846                # Initial file name and htsFiles
 4847                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4848                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4849                    {
 4850                        "uri": tmp_vcf_name,
 4851                        "htsFormat": "VCF",
 4852                        "genomeAssembly": assembly,
 4853                    }
 4854                ]
 4855
 4856                ### Add metaData ###
 4857
 4858                # If metaData not in analysis dict
 4859                if "metaData" not in param_exomiser_analysis_dict:
 4860                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4861                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4862                        "createdBy": "howard",
 4863                        "phenopacketSchemaVersion": 1,
 4864                    }
 4865
 4866                ### OutputOptions ###
 4867
 4868                # Init output result folder
 4869                output_results = os.path.join(tmp_dir, "results")
 4870
 4871                # If no outputOptions in analysis dict
 4872                if "outputOptions" not in param_exomiser_analysis_dict:
 4873
 4874                    # default output formats
 4875                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4876
 4877                    # Get outputOptions in param
 4878                    output_options = param_exomiser.get("outputOptions", None)
 4879
 4880                    # If no output_options in param -> check
 4881                    if not output_options:
 4882                        output_options = {
 4883                            "outputContributingVariantsOnly": False,
 4884                            "numGenes": 0,
 4885                            "outputFormats": defaut_output_formats,
 4886                        }
 4887
 4888                    # Replace outputDirectory in output options
 4889                    output_options["outputDirectory"] = output_results
 4890                    output_options["outputFileName"] = "howard"
 4891
 4892                    # Add outputOptions in analysis dict
 4893                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4894
 4895                else:
 4896
 4897                    # Replace output_results and output format (if exists in param)
 4898                    param_exomiser_analysis_dict["outputOptions"][
 4899                        "outputDirectory"
 4900                    ] = output_results
 4901                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4902                        list(
 4903                            set(
 4904                                param_exomiser_analysis_dict.get(
 4905                                    "outputOptions", {}
 4906                                ).get("outputFormats", [])
 4907                                + ["TSV_VARIANT", "VCF"]
 4908                            )
 4909                        )
 4910                    )
 4911
 4912                # log
 4913                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4914
 4915                ### ANALYSIS FILE ###
 4916                #####################
 4917
 4918                ### Full JSON analysis config file ###
 4919
 4920                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4921                with open(exomiser_analysis, "w") as fp:
 4922                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4923
 4924                ### SPLIT analysis and sample config files
 4925
 4926                # Splitted analysis dict
 4927                param_exomiser_analysis_dict_for_split = (
 4928                    param_exomiser_analysis_dict.copy()
 4929                )
 4930
 4931                # Phenopacket JSON file
 4932                exomiser_analysis_phenopacket = os.path.join(
 4933                    tmp_dir, "analysis_phenopacket.json"
 4934                )
 4935                with open(exomiser_analysis_phenopacket, "w") as fp:
 4936                    json.dump(
 4937                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4938                        fp,
 4939                        indent=4,
 4940                    )
 4941
 4942                # Analysis JSON file without Phenopacket parameters
 4943                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4944                exomiser_analysis_analysis = os.path.join(
 4945                    tmp_dir, "analysis_analysis.json"
 4946                )
 4947                with open(exomiser_analysis_analysis, "w") as fp:
 4948                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4949
 4950                ### INITAL VCF file ###
 4951                #######################
 4952
 4953                ### Create list of samples to use and include inti initial VCF file ####
 4954
 4955                # Subject (main sample)
 4956                # Get sample ID in analysis dict
 4957                sample_subject = (
 4958                    param_exomiser_analysis_dict.get("phenopacket", {})
 4959                    .get("subject", {})
 4960                    .get("id", None)
 4961                )
 4962                sample_proband = (
 4963                    param_exomiser_analysis_dict.get("phenopacket", {})
 4964                    .get("proband", {})
 4965                    .get("subject", {})
 4966                    .get("id", None)
 4967                )
 4968                sample = []
 4969                if sample_subject:
 4970                    sample.append(sample_subject)
 4971                if sample_proband:
 4972                    sample.append(sample_proband)
 4973
 4974                # Get sample ID within Pedigree
 4975                pedigree_persons_list = (
 4976                    param_exomiser_analysis_dict.get("phenopacket", {})
 4977                    .get("pedigree", {})
 4978                    .get("persons", {})
 4979                )
 4980
 4981                # Create list with all sample ID in pedigree (if exists)
 4982                pedigree_persons = []
 4983                for person in pedigree_persons_list:
 4984                    pedigree_persons.append(person.get("individualId"))
 4985
 4986                # Concat subject sample ID and samples ID in pedigreesamples
 4987                samples = list(set(sample + pedigree_persons))
 4988
 4989                # Check if sample list is not empty
 4990                if not samples:
 4991                    log.error(f"No samples found")
 4992                    raise ValueError(f"No samples found")
 4993
 4994                # Create VCF with sample (either sample in param or first one by default)
 4995                # Export VCF file
 4996                self.export_variant_vcf(
 4997                    vcf_file=tmp_vcf_name,
 4998                    remove_info=True,
 4999                    add_samples=True,
 5000                    list_samples=samples,
 5001                    index=False,
 5002                )
 5003
 5004                ### Execute Exomiser ###
 5005                ########################
 5006
 5007                # Init command
 5008                exomiser_command = ""
 5009
 5010                # Command exomiser options
 5011                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 5012
 5013                # Release
 5014                exomiser_release = param_exomiser.get("release", None)
 5015                if exomiser_release:
 5016                    # phenotype data version
 5017                    exomiser_options += (
 5018                        f" --exomiser.phenotype.data-version={exomiser_release} "
 5019                    )
 5020                    # data version
 5021                    exomiser_options += (
 5022                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 5023                    )
 5024                    # variant white list
 5025                    variant_white_list_file = (
 5026                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 5027                    )
 5028                    if os.path.exists(
 5029                        os.path.join(
 5030                            databases_folders, assembly, variant_white_list_file
 5031                        )
 5032                    ):
 5033                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 5034
 5035                # transcript_source
 5036                transcript_source = param_exomiser.get(
 5037                    "transcript_source", None
 5038                )  # ucsc, refseq, ensembl
 5039                if transcript_source:
 5040                    exomiser_options += (
 5041                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 5042                    )
 5043
 5044                # If analysis contain proband param
 5045                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 5046                    "proband", {}
 5047                ):
 5048                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 5049
 5050                # If no proband (usually uniq sample)
 5051                else:
 5052                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5053
 5054                # Log
 5055                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5056
 5057                # Run command
 5058                result = subprocess.call(
 5059                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5060                )
 5061                if result:
 5062                    log.error("Exomiser command failed")
 5063                    raise ValueError("Exomiser command failed")
 5064
 5065                ### RESULTS ###
 5066                ###############
 5067
 5068                ### Annotate with TSV fields ###
 5069
 5070                # Init result tsv file
 5071                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5072
 5073                # Init result tsv file
 5074                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5075
 5076                # Parse TSV file and explode columns in INFO field
 5077                if exomiser_to_info and os.path.exists(output_results_tsv):
 5078
 5079                    # Log
 5080                    log.debug("Exomiser columns to VCF INFO field")
 5081
 5082                    # Retrieve columns and types
 5083                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5084                    output_results_tsv_df = self.get_query_to_df(query)
 5085                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5086
 5087                    # Init concat fields for update
 5088                    sql_query_update_concat_fields = []
 5089
 5090                    # Fields to avoid
 5091                    fields_to_avoid = [
 5092                        "CONTIG",
 5093                        "START",
 5094                        "END",
 5095                        "REF",
 5096                        "ALT",
 5097                        "QUAL",
 5098                        "FILTER",
 5099                        "GENOTYPE",
 5100                    ]
 5101
 5102                    # List all columns to add into header
 5103                    for header_column in output_results_tsv_columns:
 5104
 5105                        # If header column is enable
 5106                        if header_column not in fields_to_avoid:
 5107
 5108                            # Header info type
 5109                            header_info_type = "String"
 5110                            header_column_df = output_results_tsv_df[header_column]
 5111                            header_column_df_dtype = header_column_df.dtype
 5112                            if header_column_df_dtype == object:
 5113                                if (
 5114                                    pd.to_numeric(header_column_df, errors="coerce")
 5115                                    .notnull()
 5116                                    .all()
 5117                                ):
 5118                                    header_info_type = "Float"
 5119                            else:
 5120                                header_info_type = "Integer"
 5121
 5122                            # Header info
 5123                            characters_to_validate = ["-"]
 5124                            pattern = "[" + "".join(characters_to_validate) + "]"
 5125                            header_info_name = re.sub(
 5126                                pattern,
 5127                                "_",
 5128                                f"Exomiser_{header_column}".replace("#", ""),
 5129                            )
 5130                            header_info_number = "."
 5131                            header_info_description = (
 5132                                f"Exomiser {header_column} annotation"
 5133                            )
 5134                            header_info_source = "Exomiser"
 5135                            header_info_version = "unknown"
 5136                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5137                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5138                                header_info_name,
 5139                                header_info_number,
 5140                                header_info_type,
 5141                                header_info_description,
 5142                                header_info_source,
 5143                                header_info_version,
 5144                                header_info_code,
 5145                            )
 5146
 5147                            # Add field to add for update to concat fields
 5148                            sql_query_update_concat_fields.append(
 5149                                f"""
 5150                                CASE
 5151                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5152                                    THEN concat(
 5153                                        '{header_info_name}=',
 5154                                        table_parquet."{header_column}",
 5155                                        ';'
 5156                                        )
 5157
 5158                                    ELSE ''
 5159                                END
 5160                            """
 5161                            )
 5162
 5163                    # Update query
 5164                    sql_query_update = f"""
 5165                        UPDATE {table_variants} as table_variants
 5166                            SET INFO = concat(
 5167                                            CASE
 5168                                                WHEN INFO NOT IN ('', '.')
 5169                                                THEN INFO
 5170                                                ELSE ''
 5171                                            END,
 5172                                            CASE
 5173                                                WHEN table_variants.INFO NOT IN ('','.')
 5174                                                THEN ';'
 5175                                                ELSE ''
 5176                                            END,
 5177                                            (
 5178                                            SELECT 
 5179                                                concat(
 5180                                                    {",".join(sql_query_update_concat_fields)}
 5181                                                )
 5182                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5183                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5184                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5185                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5186                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5187                                            )
 5188                                        )
 5189                            ;
 5190                        """
 5191
 5192                    # Update
 5193                    self.conn.execute(sql_query_update)
 5194
 5195                ### Annotate with VCF INFO field ###
 5196
 5197                # Init result VCF file
 5198                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5199
 5200                # If VCF exists
 5201                if os.path.exists(output_results_vcf):
 5202
 5203                    # Log
 5204                    log.debug("Exomiser result VCF update variants")
 5205
 5206                    # Find Exomiser INFO field annotation in header
 5207                    with gzip.open(output_results_vcf, "rt") as f:
 5208                        header_list = self.read_vcf_header(f)
 5209                    exomiser_vcf_header = vcf.Reader(
 5210                        io.StringIO("\n".join(header_list))
 5211                    )
 5212
 5213                    # Add annotation INFO field to header
 5214                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5215
 5216                    # Update variants with VCF
 5217                    self.update_from_vcf(output_results_vcf)
 5218
 5219        return True
 5220
 5221    def annotation_snpeff(self, threads: int = None) -> None:
 5222        """
 5223        This function annotate with snpEff
 5224
 5225        :param threads: The number of threads to use
 5226        :return: the value of the variable "return_value".
 5227        """
 5228
 5229        # DEBUG
 5230        log.debug("Start annotation with snpeff databases")
 5231
 5232        # Threads
 5233        if not threads:
 5234            threads = self.get_threads()
 5235        log.debug("Threads: " + str(threads))
 5236
 5237        # DEBUG
 5238        delete_tmp = True
 5239        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5240            delete_tmp = False
 5241            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5242
 5243        # Config
 5244        config = self.get_config()
 5245        log.debug("Config: " + str(config))
 5246
 5247        # Config - Folders - Databases
 5248        databases_folders = (
 5249            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5250        )
 5251        log.debug("Databases annotations: " + str(databases_folders))
 5252
 5253        # Config - snpEff bin command
 5254        snpeff_bin_command = get_bin_command(
 5255            bin="snpEff.jar",
 5256            tool="snpeff",
 5257            bin_type="jar",
 5258            config=config,
 5259            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5260        )
 5261        if not snpeff_bin_command:
 5262            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5263            log.error(msg_err)
 5264            raise ValueError(msg_err)
 5265
 5266        # Config - snpEff databases
 5267        snpeff_databases = (
 5268            config.get("folders", {})
 5269            .get("databases", {})
 5270            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5271        )
 5272        snpeff_databases = full_path(snpeff_databases)
 5273        if snpeff_databases is not None and snpeff_databases != "":
 5274            log.debug(f"Create snpEff databases folder")
 5275            if not os.path.exists(snpeff_databases):
 5276                os.makedirs(snpeff_databases)
 5277
 5278        # Param
 5279        param = self.get_param()
 5280        log.debug("Param: " + str(param))
 5281
 5282        # Param
 5283        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5284        log.debug("Options: " + str(options))
 5285
 5286        # Param - Assembly
 5287        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5288
 5289        # Param - Options
 5290        snpeff_options = (
 5291            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5292        )
 5293        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5294        snpeff_csvstats = (
 5295            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5296        )
 5297        if snpeff_stats:
 5298            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5299            snpeff_stats = full_path(snpeff_stats)
 5300            snpeff_options += f" -stats {snpeff_stats}"
 5301        if snpeff_csvstats:
 5302            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5303            snpeff_csvstats = full_path(snpeff_csvstats)
 5304            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5305
 5306        # Data
 5307        table_variants = self.get_table_variants()
 5308
 5309        # Check if not empty
 5310        log.debug("Check if not empty")
 5311        sql_query_chromosomes = (
 5312            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5313        )
 5314        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5315        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5316            log.info(f"VCF empty")
 5317            return
 5318
 5319        # Export in VCF
 5320        log.debug("Create initial file to annotate")
 5321        tmp_vcf = NamedTemporaryFile(
 5322            prefix=self.get_prefix(),
 5323            dir=self.get_tmp_dir(),
 5324            suffix=".vcf.gz",
 5325            delete=True,
 5326        )
 5327        tmp_vcf_name = tmp_vcf.name
 5328
 5329        # VCF header
 5330        vcf_reader = self.get_header()
 5331        log.debug("Initial header: " + str(vcf_reader.infos))
 5332
 5333        # Existing annotations
 5334        for vcf_annotation in self.get_header().infos:
 5335
 5336            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5337            log.debug(
 5338                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5339            )
 5340
 5341        # Memory limit
 5342        # if config.get("memory", None):
 5343        #     memory_limit = config.get("memory", "8G")
 5344        # else:
 5345        #     memory_limit = "8G"
 5346        memory_limit = self.get_memory("8G")
 5347        log.debug(f"memory_limit: {memory_limit}")
 5348
 5349        # snpEff java options
 5350        snpeff_java_options = (
 5351            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5352        )
 5353        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5354
 5355        force_update_annotation = True
 5356
 5357        if "ANN" not in self.get_header().infos or force_update_annotation:
 5358
 5359            # Check snpEff database
 5360            log.debug(f"Check snpEff databases {[assembly]}")
 5361            databases_download_snpeff(
 5362                folder=snpeff_databases, assemblies=[assembly], config=config
 5363            )
 5364
 5365            # Export VCF file
 5366            self.export_variant_vcf(
 5367                vcf_file=tmp_vcf_name,
 5368                remove_info=True,
 5369                add_samples=False,
 5370                index=True,
 5371            )
 5372
 5373            # Tmp file
 5374            err_files = []
 5375            tmp_annotate_vcf = NamedTemporaryFile(
 5376                prefix=self.get_prefix(),
 5377                dir=self.get_tmp_dir(),
 5378                suffix=".vcf",
 5379                delete=False,
 5380            )
 5381            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5382            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5383            err_files.append(tmp_annotate_vcf_name_err)
 5384
 5385            # Command
 5386            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5387            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5388            run_parallel_commands([snpeff_command], 1)
 5389
 5390            # Error messages
 5391            log.info(f"Error/Warning messages:")
 5392            error_message_command_all = []
 5393            error_message_command_warning = []
 5394            error_message_command_err = []
 5395            for err_file in err_files:
 5396                with open(err_file, "r") as f:
 5397                    for line in f:
 5398                        message = line.strip()
 5399                        error_message_command_all.append(message)
 5400                        if line.startswith("[W::"):
 5401                            error_message_command_warning.append(message)
 5402                        if line.startswith("[E::"):
 5403                            error_message_command_err.append(f"{err_file}: " + message)
 5404            # log info
 5405            for message in list(
 5406                set(error_message_command_err + error_message_command_warning)
 5407            ):
 5408                log.info(f"   {message}")
 5409            # debug info
 5410            for message in list(set(error_message_command_all)):
 5411                log.debug(f"   {message}")
 5412            # failed
 5413            if len(error_message_command_err):
 5414                log.error("Annotation failed: Error in commands")
 5415                raise ValueError("Annotation failed: Error in commands")
 5416
 5417            # Find annotation in header
 5418            with open(tmp_annotate_vcf_name, "rt") as f:
 5419                header_list = self.read_vcf_header(f)
 5420            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5421
 5422            for ann in annovar_vcf_header.infos:
 5423                if ann not in self.get_header().infos:
 5424                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5425
 5426            # Update variants
 5427            log.info(f"Annotation - Updating...")
 5428            self.update_from_vcf(tmp_annotate_vcf_name)
 5429
 5430        else:
 5431            if "ANN" in self.get_header().infos:
 5432                log.debug(f"Existing snpEff annotations in VCF")
 5433            if force_update_annotation:
 5434                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5435
 5436    def annotation_annovar(self, threads: int = None) -> None:
 5437        """
 5438        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5439        annotations
 5440
 5441        :param threads: number of threads to use
 5442        :return: the value of the variable "return_value".
 5443        """
 5444
 5445        # DEBUG
 5446        log.debug("Start annotation with Annovar databases")
 5447
 5448        # Threads
 5449        if not threads:
 5450            threads = self.get_threads()
 5451        log.debug("Threads: " + str(threads))
 5452
 5453        # Tmp en Err files
 5454        tmp_files = []
 5455        err_files = []
 5456
 5457        # DEBUG
 5458        delete_tmp = True
 5459        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5460            delete_tmp = False
 5461            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5462
 5463        # Config
 5464        config = self.get_config()
 5465        log.debug("Config: " + str(config))
 5466
 5467        # Config - Folders - Databases
 5468        databases_folders = (
 5469            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5470        )
 5471        log.debug("Databases annotations: " + str(databases_folders))
 5472
 5473        # Config - annovar bin command
 5474        annovar_bin_command = get_bin_command(
 5475            bin="table_annovar.pl",
 5476            tool="annovar",
 5477            bin_type="perl",
 5478            config=config,
 5479            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5480        )
 5481        if not annovar_bin_command:
 5482            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5483            log.error(msg_err)
 5484            raise ValueError(msg_err)
 5485
 5486        # Config - BCFTools bin command
 5487        bcftools_bin_command = get_bin_command(
 5488            bin="bcftools",
 5489            tool="bcftools",
 5490            bin_type="bin",
 5491            config=config,
 5492            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5493        )
 5494        if not bcftools_bin_command:
 5495            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5496            log.error(msg_err)
 5497            raise ValueError(msg_err)
 5498
 5499        # Config - annovar databases
 5500        annovar_databases = (
 5501            config.get("folders", {})
 5502            .get("databases", {})
 5503            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5504        )
 5505        if annovar_databases is not None:
 5506            if isinstance(annovar_databases, list):
 5507                annovar_databases = full_path(annovar_databases[0])
 5508                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
 5509            annovar_databases = full_path(annovar_databases)
 5510            if not os.path.exists(annovar_databases):
 5511                log.info(f"Annovar databases folder '{annovar_databases}' created")
 5512                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
 5513        else:
 5514            msg_err = f"Annovar databases configuration failed"
 5515            log.error(msg_err)
 5516            raise ValueError(msg_err)
 5517
 5518        # Param
 5519        param = self.get_param()
 5520        log.debug("Param: " + str(param))
 5521
 5522        # Param - options
 5523        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5524        log.debug("Options: " + str(options))
 5525
 5526        # Param - annotations
 5527        annotations = (
 5528            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5529        )
 5530        log.debug("Annotations: " + str(annotations))
 5531
 5532        # Param - Assembly
 5533        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5534
 5535        # Annovar database assembly
 5536        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5537        if annovar_databases_assembly != "" and not os.path.exists(
 5538            annovar_databases_assembly
 5539        ):
 5540            os.makedirs(annovar_databases_assembly)
 5541
 5542        # Data
 5543        table_variants = self.get_table_variants()
 5544
 5545        # Check if not empty
 5546        log.debug("Check if not empty")
 5547        sql_query_chromosomes = (
 5548            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5549        )
 5550        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5551        if not sql_query_chromosomes_df["count"][0]:
 5552            log.info(f"VCF empty")
 5553            return
 5554
 5555        # VCF header
 5556        vcf_reader = self.get_header()
 5557        log.debug("Initial header: " + str(vcf_reader.infos))
 5558
 5559        # Existing annotations
 5560        for vcf_annotation in self.get_header().infos:
 5561
 5562            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5563            log.debug(
 5564                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5565            )
 5566
 5567        force_update_annotation = True
 5568
 5569        if annotations:
 5570
 5571            commands = []
 5572            tmp_annotates_vcf_name_list = []
 5573
 5574            # Export in VCF
 5575            log.debug("Create initial file to annotate")
 5576            tmp_vcf = NamedTemporaryFile(
 5577                prefix=self.get_prefix(),
 5578                dir=self.get_tmp_dir(),
 5579                suffix=".vcf.gz",
 5580                delete=False,
 5581            )
 5582            tmp_vcf_name = tmp_vcf.name
 5583            tmp_files.append(tmp_vcf_name)
 5584            tmp_files.append(tmp_vcf_name + ".tbi")
 5585
 5586            # Export VCF file
 5587            self.export_variant_vcf(
 5588                vcf_file=tmp_vcf_name,
 5589                remove_info=".",
 5590                add_samples=False,
 5591                index=True,
 5592            )
 5593
 5594            # Create file for field rename
 5595            log.debug("Create file for field rename")
 5596            tmp_rename = NamedTemporaryFile(
 5597                prefix=self.get_prefix(),
 5598                dir=self.get_tmp_dir(),
 5599                suffix=".rename",
 5600                delete=False,
 5601            )
 5602            tmp_rename_name = tmp_rename.name
 5603            tmp_files.append(tmp_rename_name)
 5604
 5605            # Check Annovar database
 5606            log.debug(
 5607                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5608            )
 5609            databases_download_annovar(
 5610                folder=annovar_databases,
 5611                files=list(annotations.keys()),
 5612                assemblies=[assembly],
 5613            )
 5614
 5615            for annotation in annotations:
 5616                annotation_fields = annotations[annotation]
 5617
 5618                if not annotation_fields:
 5619                    annotation_fields = {"INFO": None}
 5620
 5621                log.info(f"Annotations Annovar - database '{annotation}'")
 5622                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5623
 5624                # Tmp file for annovar
 5625                err_files = []
 5626                tmp_annotate_vcf_directory = TemporaryDirectory(
 5627                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5628                )
 5629                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5630                tmp_annotate_vcf_name_annovar = (
 5631                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5632                )
 5633                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5634                err_files.append(tmp_annotate_vcf_name_err)
 5635                tmp_files.append(tmp_annotate_vcf_name_err)
 5636
 5637                # Tmp file final vcf annotated by annovar
 5638                tmp_annotate_vcf = NamedTemporaryFile(
 5639                    prefix=self.get_prefix(),
 5640                    dir=self.get_tmp_dir(),
 5641                    suffix=".vcf.gz",
 5642                    delete=False,
 5643                )
 5644                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5645                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5646                tmp_files.append(tmp_annotate_vcf_name)
 5647                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5648
 5649                # Number of fields
 5650                annotation_list = []
 5651                annotation_renamed_list = []
 5652
 5653                for annotation_field in annotation_fields:
 5654
 5655                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5656                    annotation_fields_new_name = annotation_fields.get(
 5657                        annotation_field, annotation_field
 5658                    )
 5659                    if not annotation_fields_new_name:
 5660                        annotation_fields_new_name = annotation_field
 5661
 5662                    if (
 5663                        force_update_annotation
 5664                        or annotation_fields_new_name not in self.get_header().infos
 5665                    ):
 5666                        annotation_list.append(annotation_field)
 5667                        annotation_renamed_list.append(annotation_fields_new_name)
 5668                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5669                        log.warning(
 5670                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5671                        )
 5672
 5673                    # Add rename info
 5674                    run_parallel_commands(
 5675                        [
 5676                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5677                        ],
 5678                        1,
 5679                    )
 5680
 5681                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5682                log.debug("annotation_list: " + str(annotation_list))
 5683
 5684                # protocol
 5685                protocol = annotation
 5686
 5687                # argument
 5688                argument = ""
 5689
 5690                # operation
 5691                operation = "f"
 5692                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5693                    "ensGene"
 5694                ):
 5695                    operation = "g"
 5696                    if options.get("genebase", None):
 5697                        argument = f"""'{options.get("genebase","")}'"""
 5698                elif annotation in ["cytoBand"]:
 5699                    operation = "r"
 5700
 5701                # argument option
 5702                argument_option = ""
 5703                if argument != "":
 5704                    argument_option = " --argument " + argument
 5705
 5706                # command options
 5707                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5708                for option in options:
 5709                    if option not in ["genebase"]:
 5710                        command_options += f""" --{option}={options[option]}"""
 5711
 5712                # Command
 5713
 5714                # Command - Annovar
 5715                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5716                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5717
 5718                # Command - start pipe
 5719                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5720
 5721                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5722                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5723
 5724                # Command - Special characters (refGene annotation)
 5725                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5726
 5727                # Command - Clean empty fields (with value ".")
 5728                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5729
 5730                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5731                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5732                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5733                    # for ann in annotation_renamed_list:
 5734                    for ann in annotation_list:
 5735                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5736
 5737                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5738
 5739                # Command - indexing
 5740                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5741
 5742                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5743                run_parallel_commands([command_annovar], 1)
 5744
 5745                # Error messages
 5746                log.info(f"Error/Warning messages:")
 5747                error_message_command_all = []
 5748                error_message_command_warning = []
 5749                error_message_command_err = []
 5750                for err_file in err_files:
 5751                    with open(err_file, "r") as f:
 5752                        for line in f:
 5753                            message = line.strip()
 5754                            error_message_command_all.append(message)
 5755                            if line.startswith("[W::") or line.startswith("WARNING"):
 5756                                error_message_command_warning.append(message)
 5757                            if line.startswith("[E::") or line.startswith("ERROR"):
 5758                                error_message_command_err.append(
 5759                                    f"{err_file}: " + message
 5760                                )
 5761                # log info
 5762                for message in list(
 5763                    set(error_message_command_err + error_message_command_warning)
 5764                ):
 5765                    log.info(f"   {message}")
 5766                # debug info
 5767                for message in list(set(error_message_command_all)):
 5768                    log.debug(f"   {message}")
 5769                # failed
 5770                if len(error_message_command_err):
 5771                    log.error("Annotation failed: Error in commands")
 5772                    raise ValueError("Annotation failed: Error in commands")
 5773
 5774            if tmp_annotates_vcf_name_list:
 5775
 5776                # List of annotated files
 5777                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5778
 5779                # Tmp file
 5780                tmp_annotate_vcf = NamedTemporaryFile(
 5781                    prefix=self.get_prefix(),
 5782                    dir=self.get_tmp_dir(),
 5783                    suffix=".vcf.gz",
 5784                    delete=False,
 5785                )
 5786                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5787                tmp_files.append(tmp_annotate_vcf_name)
 5788                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5789                err_files.append(tmp_annotate_vcf_name_err)
 5790                tmp_files.append(tmp_annotate_vcf_name_err)
 5791
 5792                # Command merge
 5793                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5794                log.info(
 5795                    f"Annotation Annovar - Annotation merging "
 5796                    + str(len(tmp_annotates_vcf_name_list))
 5797                    + " annotated files"
 5798                )
 5799                log.debug(f"Annotation - merge command: {merge_command}")
 5800                run_parallel_commands([merge_command], 1)
 5801
 5802                # Find annotation in header
 5803                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5804                    header_list = self.read_vcf_header(f)
 5805                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5806
 5807                for ann in annovar_vcf_header.infos:
 5808                    if ann not in self.get_header().infos:
 5809                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5810
 5811                # Update variants
 5812                log.info(f"Annotation Annovar - Updating...")
 5813                self.update_from_vcf(tmp_annotate_vcf_name)
 5814
 5815            # Clean files
 5816            # Tmp file remove command
 5817            if True:
 5818                tmp_files_remove_command = ""
 5819                if tmp_files:
 5820                    tmp_files_remove_command = " ".join(tmp_files)
 5821                clean_command = f" rm -f {tmp_files_remove_command} "
 5822                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5823                log.debug(f"Annotation - cleaning command: {clean_command}")
 5824                run_parallel_commands([clean_command], 1)
 5825
    # Parquet: annotate variants using Parquet-format annotation databases
 5827    def annotation_parquet(self, threads: int = None) -> None:
 5828        """
 5829        It takes a VCF file, and annotates it with a parquet file
 5830
 5831        :param threads: number of threads to use for the annotation
 5832        :return: the value of the variable "result".
 5833        """
 5834
 5835        # DEBUG
 5836        log.debug("Start annotation with parquet databases")
 5837
 5838        # Threads
 5839        if not threads:
 5840            threads = self.get_threads()
 5841        log.debug("Threads: " + str(threads))
 5842
 5843        # DEBUG
 5844        delete_tmp = True
 5845        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5846            delete_tmp = False
 5847            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5848
 5849        # Config
 5850        databases_folders = set(
 5851            self.get_config()
 5852            .get("folders", {})
 5853            .get("databases", {})
 5854            .get("annotations", ["."])
 5855            + self.get_config()
 5856            .get("folders", {})
 5857            .get("databases", {})
 5858            .get("parquet", ["."])
 5859        )
 5860        log.debug("Databases annotations: " + str(databases_folders))
 5861
 5862        # Param
 5863        annotations = (
 5864            self.get_param()
 5865            .get("annotation", {})
 5866            .get("parquet", {})
 5867            .get("annotations", None)
 5868        )
 5869        log.debug("Annotations: " + str(annotations))
 5870
 5871        # Assembly
 5872        assembly = self.get_param().get(
 5873            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5874        )
 5875
 5876        # Force Update Annotation
 5877        force_update_annotation = (
 5878            self.get_param()
 5879            .get("annotation", {})
 5880            .get("options", {})
 5881            .get("annotations_update", False)
 5882        )
 5883        log.debug(f"force_update_annotation={force_update_annotation}")
 5884        force_append_annotation = (
 5885            self.get_param()
 5886            .get("annotation", {})
 5887            .get("options", {})
 5888            .get("annotations_append", False)
 5889        )
 5890        log.debug(f"force_append_annotation={force_append_annotation}")
 5891
 5892        # Data
 5893        table_variants = self.get_table_variants()
 5894
 5895        # Check if not empty
 5896        log.debug("Check if not empty")
 5897        sql_query_chromosomes_df = self.get_query_to_df(
 5898            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5899        )
 5900        if not sql_query_chromosomes_df["count"][0]:
 5901            log.info(f"VCF empty")
 5902            return
 5903
 5904        # VCF header
 5905        vcf_reader = self.get_header()
 5906        log.debug("Initial header: " + str(vcf_reader.infos))
 5907
 5908        # Nb Variants POS
 5909        log.debug("NB Variants Start")
 5910        nb_variants = self.conn.execute(
 5911            f"SELECT count(*) AS count FROM variants"
 5912        ).fetchdf()["count"][0]
 5913        log.debug("NB Variants Stop")
 5914
 5915        # Existing annotations
 5916        for vcf_annotation in self.get_header().infos:
 5917
 5918            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5919            log.debug(
 5920                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5921            )
 5922
 5923        # Added columns
 5924        added_columns = []
 5925
 5926        # drop indexes
 5927        log.debug(f"Drop indexes...")
 5928        self.drop_indexes()
 5929
 5930        if annotations:
 5931
 5932            if "ALL" in annotations:
 5933
 5934                all_param = annotations.get("ALL", {})
 5935                all_param_formats = all_param.get("formats", None)
 5936                all_param_releases = all_param.get("releases", None)
 5937
 5938                databases_infos_dict = self.scan_databases(
 5939                    database_formats=all_param_formats,
 5940                    database_releases=all_param_releases,
 5941                )
 5942                for database_infos in databases_infos_dict.keys():
 5943                    if database_infos not in annotations:
 5944                        annotations[database_infos] = {"INFO": None}
 5945
 5946            for annotation in annotations:
 5947
 5948                if annotation in ["ALL"]:
 5949                    continue
 5950
 5951                # Annotation Name
 5952                annotation_name = os.path.basename(annotation)
 5953
 5954                # Annotation fields
 5955                annotation_fields = annotations[annotation]
 5956                if not annotation_fields:
 5957                    annotation_fields = {"INFO": None}
 5958
 5959                log.debug(f"Annotation '{annotation_name}'")
 5960                log.debug(
 5961                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5962                )
 5963
 5964                # Create Database
 5965                database = Database(
 5966                    database=annotation,
 5967                    databases_folders=databases_folders,
 5968                    assembly=assembly,
 5969                )
 5970
 5971                # Find files
 5972                parquet_file = database.get_database()
 5973                parquet_hdr_file = database.get_header_file()
 5974                parquet_type = database.get_type()
 5975
 5976                # Check if files exists
 5977                if not parquet_file or not parquet_hdr_file:
 5978                    msg_err_list = []
 5979                    if not parquet_file:
 5980                        msg_err_list.append(
 5981                            f"Annotation failed: Annotation file not found"
 5982                        )
 5983                    if parquet_file and not parquet_hdr_file:
 5984                        msg_err_list.append(
 5985                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
 5986                        )
 5987
 5988                    log.error(". ".join(msg_err_list))
 5989                    raise ValueError(". ".join(msg_err_list))
 5990                else:
 5991                    # Get parquet connexion
 5992                    parquet_sql_attach = database.get_sql_database_attach(
 5993                        output="query"
 5994                    )
 5995                    if parquet_sql_attach:
 5996                        self.conn.execute(parquet_sql_attach)
 5997                    parquet_file_link = database.get_sql_database_link()
 5998                    # Log
 5999                    log.debug(
 6000                        f"Annotation '{annotation_name}' - file: "
 6001                        + str(parquet_file)
 6002                        + " and "
 6003                        + str(parquet_hdr_file)
 6004                    )
 6005
 6006                    # Database full header columns
 6007                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 6008                        parquet_hdr_file
 6009                    )
 6010                    # Log
 6011                    log.debug(
 6012                        "Annotation database header columns : "
 6013                        + str(parquet_hdr_vcf_header_columns)
 6014                    )
 6015
 6016                    # Load header as VCF object
 6017                    parquet_hdr_vcf_header_infos = database.get_header().infos
 6018                    # Log
 6019                    log.debug(
 6020                        "Annotation database header: "
 6021                        + str(parquet_hdr_vcf_header_infos)
 6022                    )
 6023
 6024                    # Get extra infos
 6025                    parquet_columns = database.get_extra_columns()
 6026                    # Log
 6027                    log.debug("Annotation database Columns: " + str(parquet_columns))
 6028
 6029                    # Add extra columns if "ALL" in annotation_fields
 6030                    # if "ALL" in annotation_fields:
 6031                    #     allow_add_extra_column = True
 6032                    if "ALL" in annotation_fields and database.get_extra_columns():
 6033                        for extra_column in database.get_extra_columns():
 6034                            if (
 6035                                extra_column not in annotation_fields
 6036                                and extra_column.replace("INFO/", "")
 6037                                not in parquet_hdr_vcf_header_infos
 6038                            ):
 6039                                parquet_hdr_vcf_header_infos[extra_column] = (
 6040                                    vcf.parser._Info(
 6041                                        extra_column,
 6042                                        ".",
 6043                                        "String",
 6044                                        f"{extra_column} description",
 6045                                        "unknown",
 6046                                        "unknown",
 6047                                        self.code_type_map["String"],
 6048                                    )
 6049                                )
 6050
 6051                    # For all fields in database
 6052                    annotation_fields_all = False
 6053                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 6054                        annotation_fields_all = True
 6055                        annotation_fields = {
 6056                            key: key for key in parquet_hdr_vcf_header_infos
 6057                        }
 6058
 6059                        log.debug(
 6060                            "Annotation database header - All annotations added: "
 6061                            + str(annotation_fields)
 6062                        )
 6063
 6064                    # Init
 6065
 6066                    # List of annotation fields to use
 6067                    sql_query_annotation_update_info_sets = []
 6068
 6069                    # List of annotation to agregate
 6070                    sql_query_annotation_to_agregate = []
 6071
 6072                    # Number of fields
 6073                    nb_annotation_field = 0
 6074
 6075                    # Annotation fields processed
 6076                    annotation_fields_processed = []
 6077
 6078                    # Columns mapping
 6079                    map_columns = database.map_columns(
 6080                        columns=annotation_fields, prefixes=["INFO/"]
 6081                    )
 6082
 6083                    # Query dict for fields to remove (update option)
 6084                    query_dict_remove = {}
 6085
 6086                    # Fetch Anotation fields
 6087                    for annotation_field in annotation_fields:
 6088
 6089                        # annotation_field_column
 6090                        annotation_field_column = map_columns.get(
 6091                            annotation_field, "INFO"
 6092                        )
 6093
 6094                        # field new name, if parametered
 6095                        annotation_fields_new_name = annotation_fields.get(
 6096                            annotation_field, annotation_field
 6097                        )
 6098                        if not annotation_fields_new_name:
 6099                            annotation_fields_new_name = annotation_field
 6100
 6101                        # To annotate
 6102                        # force_update_annotation = True
 6103                        # force_append_annotation = True
 6104                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 6105                        if annotation_field in parquet_hdr_vcf_header_infos and (
 6106                            force_update_annotation
 6107                            or force_append_annotation
 6108                            or (
 6109                                annotation_fields_new_name
 6110                                not in self.get_header().infos
 6111                            )
 6112                        ):
 6113
 6114                            # Add field to annotation to process list
 6115                            annotation_fields_processed.append(
 6116                                annotation_fields_new_name
 6117                            )
 6118
 6119                            # explode infos for the field
 6120                            annotation_fields_new_name_info_msg = ""
 6121                            if (
 6122                                force_update_annotation
 6123                                and annotation_fields_new_name
 6124                                in self.get_header().infos
 6125                            ):
 6126                                # Remove field from INFO
 6127                                query = f"""
 6128                                    UPDATE {table_variants} as table_variants
 6129                                    SET INFO = REGEXP_REPLACE(
 6130                                                concat(table_variants.INFO,''),
 6131                                                ';*{annotation_fields_new_name}=[^;]*',
 6132                                                ''
 6133                                                )
 6134                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 6135                                """
 6136                                annotation_fields_new_name_info_msg = " [update]"
 6137                                query_dict_remove[
 6138                                    f"remove 'INFO/{annotation_fields_new_name}'"
 6139                                ] = query
 6140
 6141                            # Sep between fields in INFO
 6142                            nb_annotation_field += 1
 6143                            if nb_annotation_field > 1:
 6144                                annotation_field_sep = ";"
 6145                            else:
 6146                                annotation_field_sep = ""
 6147
 6148                            log.info(
 6149                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 6150                            )
 6151
 6152                            # Add INFO field to header
 6153                            parquet_hdr_vcf_header_infos_number = (
 6154                                parquet_hdr_vcf_header_infos[annotation_field].num
 6155                                or "."
 6156                            )
 6157                            parquet_hdr_vcf_header_infos_type = (
 6158                                parquet_hdr_vcf_header_infos[annotation_field].type
 6159                                or "String"
 6160                            )
 6161                            parquet_hdr_vcf_header_infos_description = (
 6162                                parquet_hdr_vcf_header_infos[annotation_field].desc
 6163                                or f"{annotation_field} description"
 6164                            )
 6165                            parquet_hdr_vcf_header_infos_source = (
 6166                                parquet_hdr_vcf_header_infos[annotation_field].source
 6167                                or "unknown"
 6168                            )
 6169                            parquet_hdr_vcf_header_infos_version = (
 6170                                parquet_hdr_vcf_header_infos[annotation_field].version
 6171                                or "unknown"
 6172                            )
 6173
 6174                            vcf_reader.infos[annotation_fields_new_name] = (
 6175                                vcf.parser._Info(
 6176                                    annotation_fields_new_name,
 6177                                    parquet_hdr_vcf_header_infos_number,
 6178                                    parquet_hdr_vcf_header_infos_type,
 6179                                    parquet_hdr_vcf_header_infos_description,
 6180                                    parquet_hdr_vcf_header_infos_source,
 6181                                    parquet_hdr_vcf_header_infos_version,
 6182                                    self.code_type_map[
 6183                                        parquet_hdr_vcf_header_infos_type
 6184                                    ],
 6185                                )
 6186                            )
 6187
 6188                            # Append
 6189                            if force_append_annotation:
 6190                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 6191                            else:
 6192                                query_case_when_append = ""
 6193
 6194                            # Annotation/Update query fields
 6195                            # Found in INFO column
 6196                            if (
 6197                                annotation_field_column == "INFO"
 6198                                and "INFO" in parquet_hdr_vcf_header_columns
 6199                            ):
 6200                                sql_query_annotation_update_info_sets.append(
 6201                                    f"""
 6202                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 6203                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 6204                                        ELSE ''
 6205                                    END
 6206                                """
 6207                                )
 6208                            # Found in a specific column
 6209                            else:
 6210                                sql_query_annotation_update_info_sets.append(
 6211                                    f"""
 6212                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 6213                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 6214                                        ELSE ''
 6215                                    END
 6216                                """
 6217                                )
 6218                                sql_query_annotation_to_agregate.append(
 6219                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6220                                )
 6221
 6222                        # Not to annotate
 6223                        else:
 6224
 6225                            if force_update_annotation:
 6226                                annotation_message = "forced"
 6227                            else:
 6228                                annotation_message = "skipped"
 6229
 6230                            if annotation_field not in parquet_hdr_vcf_header_infos:
 6231                                log.warning(
 6232                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 6233                                )
 6234                            if annotation_fields_new_name in self.get_header().infos:
 6235                                log.warning(
 6236                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 6237                                )
 6238
 6239                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 6240                    # allow_annotation_full_info = True
 6241                    allow_annotation_full_info = not force_append_annotation
 6242
 6243                    if parquet_type in ["regions"]:
 6244                        allow_annotation_full_info = False
 6245
 6246                    if (
 6247                        allow_annotation_full_info
 6248                        and nb_annotation_field == len(annotation_fields)
 6249                        and annotation_fields_all
 6250                        and (
 6251                            "INFO" in parquet_hdr_vcf_header_columns
 6252                            and "INFO" in database.get_extra_columns()
 6253                        )
 6254                    ):
 6255                        log.debug("Column INFO annotation enabled")
 6256                        sql_query_annotation_update_info_sets = []
 6257                        sql_query_annotation_update_info_sets.append(
 6258                            f" table_parquet.INFO "
 6259                        )
 6260
 6261                    if sql_query_annotation_update_info_sets:
 6262
 6263                        # Annotate
 6264                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 6265
 6266                        # Join query annotation update info sets for SQL
 6267                        sql_query_annotation_update_info_sets_sql = ",".join(
 6268                            sql_query_annotation_update_info_sets
 6269                        )
 6270
 6271                        # Check chromosomes list (and variants infos)
 6272                        sql_query_chromosomes = f"""
 6273                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 6274                            FROM {table_variants} as table_variants
 6275                            GROUP BY table_variants."#CHROM"
 6276                            ORDER BY table_variants."#CHROM"
 6277                            """
 6278                        sql_query_chromosomes_df = self.conn.execute(
 6279                            sql_query_chromosomes
 6280                        ).df()
 6281                        sql_query_chromosomes_dict = {
 6282                            entry["CHROM"]: {
 6283                                "count": entry["count_variants"],
 6284                                "min": entry["min_variants"],
 6285                                "max": entry["max_variants"],
 6286                            }
 6287                            for index, entry in sql_query_chromosomes_df.iterrows()
 6288                        }
 6289
 6290                        # Init
 6291                        nb_of_query = 0
 6292                        nb_of_variant_annotated = 0
 6293                        query_dict = query_dict_remove
 6294
 6295                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 6296                        for chrom in sql_query_chromosomes_dict:
 6297
 6298                            # Number of variant by chromosome
 6299                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 6300                                chrom, {}
 6301                            ).get("count", 0)
 6302
 6303                            log.debug(
 6304                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 6305                            )
 6306
 6307                            # Annotation with regions database
 6308                            if parquet_type in ["regions"]:
 6309                                sql_query_annotation_from_clause = f"""
 6310                                    FROM (
 6311                                        SELECT 
 6312                                            '{chrom}' AS \"#CHROM\",
 6313                                            table_variants_from.\"POS\" AS \"POS\",
 6314                                            {",".join(sql_query_annotation_to_agregate)}
 6315                                        FROM {table_variants} as table_variants_from
 6316                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 6317                                            table_parquet_from."#CHROM" = '{chrom}'
 6318                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 6319                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 6320                                        )
 6321                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 6322                                        GROUP BY table_variants_from.\"POS\"
 6323                                        )
 6324                                        as table_parquet
 6325                                """
 6326
 6327                                sql_query_annotation_where_clause = """
 6328                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 6329                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6330                                """
 6331
 6332                            # Annotation with variants database
 6333                            else:
 6334                                sql_query_annotation_from_clause = f"""
 6335                                    FROM {parquet_file_link} as table_parquet
 6336                                """
 6337                                sql_query_annotation_where_clause = f"""
 6338                                    table_variants."#CHROM" = '{chrom}'
 6339                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 6340                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6341                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 6342                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 6343                                """
 6344
 6345                            # Create update query
 6346                            sql_query_annotation_chrom_interval_pos = f"""
 6347                                UPDATE {table_variants} as table_variants
 6348                                    SET INFO = 
 6349                                        concat(
 6350                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6351                                                THEN table_variants.INFO
 6352                                                ELSE ''
 6353                                            END
 6354                                            ,
 6355                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6356                                                        AND (
 6357                                                        concat({sql_query_annotation_update_info_sets_sql})
 6358                                                        )
 6359                                                        NOT IN ('','.') 
 6360                                                    THEN ';'
 6361                                                    ELSE ''
 6362                                            END
 6363                                            ,
 6364                                            {sql_query_annotation_update_info_sets_sql}
 6365                                            )
 6366                                    {sql_query_annotation_from_clause}
 6367                                    WHERE {sql_query_annotation_where_clause}
 6368                                    ;
 6369                                """
 6370
 6371                            # Add update query to dict
 6372                            query_dict[
 6373                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6374                            ] = sql_query_annotation_chrom_interval_pos
 6375
 6376                        nb_of_query = len(query_dict)
 6377                        num_query = 0
 6378
 6379                        # SET max_expression_depth TO x
 6380                        self.conn.execute("SET max_expression_depth TO 10000")
 6381
 6382                        for query_name in query_dict:
 6383                            query = query_dict[query_name]
 6384                            num_query += 1
 6385                            log.info(
 6386                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6387                            )
 6388                            result = self.conn.execute(query)
 6389                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6390                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6391                            log.info(
 6392                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6393                            )
 6394
 6395                        log.info(
 6396                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6397                        )
 6398
 6399                    else:
 6400
 6401                        log.info(
 6402                            f"Annotation '{annotation_name}' - No Annotations available"
 6403                        )
 6404
 6405                    log.debug("Final header: " + str(vcf_reader.infos))
 6406
 6407        # Remove added columns
 6408        for added_column in added_columns:
 6409            self.drop_column(column=added_column)
 6410
 6411    def annotation_splice(self, threads: int = None) -> None:
 6412        """
 6413        This function annotate with snpEff
 6414
 6415        :param threads: The number of threads to use
 6416        :return: the value of the variable "return_value".
 6417        """
 6418
 6419        # DEBUG
 6420        log.debug("Start annotation with splice tools")
 6421
 6422        # Threads
 6423        if not threads:
 6424            threads = self.get_threads()
 6425        log.debug("Threads: " + str(threads))
 6426
 6427        # DEBUG
 6428        delete_tmp = True
 6429        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6430            delete_tmp = False
 6431            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6432
 6433        # Config
 6434        config = self.get_config()
 6435        log.debug("Config: " + str(config))
 6436        splice_config = config.get("tools", {}).get("splice", {})
 6437        if not splice_config:
 6438            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6439            msg_err = "No Splice tool config"
 6440            raise ValueError(msg_err)
 6441        log.debug(f"splice_config: {splice_config}")
 6442
 6443        # Config - Folders - Databases
 6444        databases_folders = (
 6445            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6446        )
 6447        log.debug("Databases annotations: " + str(databases_folders))
 6448
 6449        # Splice docker image
 6450        splice_docker_image = splice_config.get("docker").get("image")
 6451
 6452        # Pull splice image if it's not already there
 6453        if not check_docker_image_exists(splice_docker_image):
 6454            log.warning(
 6455                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6456            )
 6457            try:
 6458                command(f"docker pull {splice_config.get('docker').get('image')}")
 6459            except subprocess.CalledProcessError:
 6460                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6461                log.error(msg_err)
 6462                raise ValueError(msg_err)
 6463
 6464        # Config - splice databases
 6465        splice_databases = (
 6466            config.get("folders", {})
 6467            .get("databases", {})
 6468            .get("splice", DEFAULT_SPLICE_FOLDER)
 6469        )
 6470        splice_databases = full_path(splice_databases)
 6471
 6472        # Param
 6473        param = self.get_param()
 6474        log.debug("Param: " + str(param))
 6475
 6476        # Param
 6477        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6478        log.debug("Options: " + str(options))
 6479
 6480        # Data
 6481        table_variants = self.get_table_variants()
 6482
 6483        # Check if not empty
 6484        log.debug("Check if not empty")
 6485        sql_query_chromosomes = (
 6486            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6487        )
 6488        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6489            log.info("VCF empty")
 6490            return None
 6491
 6492        # Export in VCF
 6493        log.debug("Create initial file to annotate")
 6494
 6495        # Create output folder / work folder
 6496        if options.get("output_folder", ""):
 6497            output_folder = options.get("output_folder", "")
 6498            if not os.path.exists(output_folder):
 6499                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6500        else:
 6501            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6502            if not os.path.exists(output_folder):
 6503                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6504
 6505        if options.get("workdir", ""):
 6506            workdir = options.get("workdir", "")
 6507        else:
 6508            workdir = "/work"
 6509
 6510        # Create tmp VCF file
 6511        tmp_vcf = NamedTemporaryFile(
 6512            prefix=self.get_prefix(),
 6513            dir=output_folder,
 6514            suffix=".vcf",
 6515            delete=False,
 6516        )
 6517        tmp_vcf_name = tmp_vcf.name
 6518
 6519        # VCF header
 6520        header = self.get_header()
 6521
 6522        # Existing annotations
 6523        for vcf_annotation in self.get_header().infos:
 6524
 6525            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6526            log.debug(
 6527                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6528            )
 6529
 6530        # Memory limit
 6531        if config.get("memory", None):
 6532            memory_limit = config.get("memory", "8G").upper()
 6533            # upper()
 6534        else:
 6535            memory_limit = "8G"
 6536        log.debug(f"memory_limit: {memory_limit}")
 6537
 6538        # Check number of variants to annotate
 6539        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6540        where_clause_regex_spip = r"SPiP_\w+"
 6541        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6542        df_list_of_variants_to_annotate = self.get_query_to_df(
 6543            query=f""" SELECT * FROM variants {where_clause} """
 6544        )
 6545        if len(df_list_of_variants_to_annotate) == 0:
 6546            log.warning(
 6547                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6548            )
 6549            return None
 6550        else:
 6551            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6552
 6553        # Export VCF file
 6554        self.export_variant_vcf(
 6555            vcf_file=tmp_vcf_name,
 6556            remove_info=True,
 6557            add_samples=True,
 6558            index=False,
 6559            where_clause=where_clause,
 6560        )
 6561        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6562        if any(value for value in splice_config.values() if value is None):
 6563            log.warning("At least one splice config parameter is empty")
 6564            # exit annotation_splice
 6565            return None
 6566
 6567        # Params in splice nf
 6568        def check_values(dico: dict):
 6569            """
 6570            Ensure parameters for NF splice pipeline
 6571            """
 6572            for key, val in dico.items():
 6573                if key == "genome":
 6574                    if any(
 6575                        assemb in options.get("genome", {})
 6576                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6577                    ):
 6578                        yield f"--{key} hg19"
 6579                    elif any(
 6580                        assemb in options.get("genome", {})
 6581                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6582                    ):
 6583                        yield f"--{key} hg38"
 6584                elif (
 6585                    (isinstance(val, str) and val)
 6586                    or isinstance(val, int)
 6587                    or isinstance(val, bool)
 6588                ):
 6589                    yield f"--{key} {val}"
 6590
 6591        # Genome
 6592        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6593        options["genome"] = genome
 6594        # NF params
 6595        nf_params = []
 6596        # Add options
 6597        if options:
 6598            log.debug(options)
 6599            nf_params = list(check_values(options))
 6600            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6601        else:
 6602            log.debug("No NF params provided")
 6603        # Add threads
 6604        if "threads" not in options.keys():
 6605            nf_params.append(f"--threads {threads}")
 6606        # Genome path
 6607        genome_path = find_genome(
 6608            config.get("folders", {})
 6609            .get("databases", {})
 6610            .get("genomes", DEFAULT_GENOME_FOLDER),
 6611            file=f"{genome}.fa",
 6612        )
 6613        # Add genome path
 6614        if not genome_path:
 6615            raise ValueError(
 6616                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6617            )
 6618        else:
 6619            log.debug(f"Genome: {genome_path}")
 6620            nf_params.append(f"--genome_path {genome_path}")
 6621
 6622        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6623            """
 6624            Setting up updated databases for SPiP and SpliceAI
 6625            """
 6626
 6627            try:
 6628
 6629                # SpliceAI assembly transcriptome
 6630                spliceai_assembly = os.path.join(
 6631                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6632                    options.get("genome"),
 6633                    "transcriptome",
 6634                )
 6635                spip_assembly = options.get("genome")
 6636
 6637                spip = find(
 6638                    f"transcriptome_{spip_assembly}.RData",
 6639                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6640                )
 6641                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6642                log.debug(f"SPiP annotations: {spip}")
 6643                log.debug(f"SpliceAI annotations: {spliceai}")
 6644                if spip and spliceai:
 6645                    return [
 6646                        f"--spip_transcriptome {spip}",
 6647                        f"--spliceai_transcriptome {spliceai}",
 6648                    ]
 6649                else:
 6650                    log.warning(
 6651                        "Can't find splice databases in configuration, use annotations file from image"
 6652                    )
 6653            except TypeError:
 6654                log.warning(
 6655                    "Can't find splice databases in configuration, use annotations file from image"
 6656                )
 6657                return []
 6658
 6659        # Add options, check if transcriptome option have already beend provided
 6660        if (
 6661            "spip_transcriptome" not in nf_params
 6662            and "spliceai_transcriptome" not in nf_params
 6663        ):
 6664            splice_reference = splice_annotations(options, config)
 6665            if splice_reference:
 6666                nf_params.extend(splice_reference)
 6667        # nf_params.append(f"--output_folder {output_folder}")
 6668        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6669        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6670        log.debug(cmd)
 6671        splice_config["docker"]["command"] = cmd
 6672
 6673        # Ensure proxy is set
 6674        proxy = [
 6675            f"-e {var}={os.getenv(var)}"
 6676            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6677            if os.getenv(var) is not None
 6678        ]
 6679        docker_cmd = get_bin_command(
 6680            tool="splice",
 6681            bin_type="docker",
 6682            config=config,
 6683            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6684            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6685        )
 6686        # print(docker_cmd)
 6687        # exit()
 6688        # Docker debug
 6689        # if splice_config.get("rm_container"):
 6690        #     rm_container = "--rm"
 6691        # else:
 6692        #     rm_container = ""
 6693        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6694        log.debug(docker_cmd)
 6695        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6696        log.debug(res.stdout)
 6697        if res.stderr:
 6698            log.error(res.stderr)
 6699        res.check_returncode()
 6700        # Update variants
 6701        log.info("Annotation - Updating...")
 6702        # Test find output vcf
 6703        log.debug(
 6704            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6705        )
 6706        output_vcf = []
 6707        # Wrong folder to look in
 6708        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6709            if (
 6710                files
 6711                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6712            ):
 6713                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6714        # log.debug(os.listdir(options.get("output_folder")))
 6715        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6716        if not output_vcf:
 6717            log.debug(
 6718                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6719            )
 6720        else:
 6721            # Get new header from annotated vcf
 6722            log.debug(f"Initial header: {len(header.infos)} fields")
 6723            # Create new header with splice infos
 6724            new_vcf = Variants(input=output_vcf[0])
 6725            new_vcf_header = new_vcf.get_header().infos
 6726            for keys, infos in new_vcf_header.items():
 6727                if keys not in header.infos.keys():
 6728                    header.infos[keys] = infos
 6729            log.debug(f"New header: {len(header.infos)} fields")
 6730            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6731            self.update_from_vcf(output_vcf[0])
 6732
 6733        # Remove file
 6734        remove_if_exists(output_vcf)
 6735
 6736    ###
 6737    # Prioritization
 6738    ###
 6739
 6740    def get_config_default(self, name: str) -> dict:
 6741        """
 6742        The function `get_config_default` returns a dictionary containing default configurations for
 6743        various calculations and prioritizations.
 6744
 6745        :param name: The `get_config_default` function returns a dictionary containing default
 6746        configurations for different calculations and prioritizations. The `name` parameter is used to
 6747        specify which specific configuration to retrieve from the dictionary
 6748        :type name: str
 6749        :return: The function `get_config_default` returns a dictionary containing default configuration
 6750        settings for different calculations and prioritizations. The specific configuration settings are
 6751        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6752        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6753        returned. If there is no match, an empty dictionary is returned.
 6754        """
 6755
 6756        config_default = {
 6757            "calculations": {
 6758                "variant_chr_pos_alt_ref": {
 6759                    "type": "sql",
 6760                    "name": "variant_chr_pos_alt_ref",
 6761                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6762                    "available": False,
 6763                    "output_column_name": "variant_chr_pos_alt_ref",
 6764                    "output_column_type": "String",
 6765                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6766                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6767                    "operation_info": True,
 6768                },
 6769                "VARTYPE": {
 6770                    "type": "sql",
 6771                    "name": "VARTYPE",
 6772                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6773                    "available": True,
 6774                    "table": "variants",
 6775                    "output_column_name": "VARTYPE",
 6776                    "output_column_type": "String",
 6777                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6778                    "operation_query": """
 6779                            CASE
 6780                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6781                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6782                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6783                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6784                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6785                                ELSE 'UNDEFINED'
 6786                            END
 6787                            """,
 6788                    "info_fields": ["SVTYPE"],
 6789                    "operation_info": True,
 6790                },
 6791                "snpeff_hgvs": {
 6792                    "type": "python",
 6793                    "name": "snpeff_hgvs",
 6794                    "description": "HGVS nomenclatures from snpEff annotation",
 6795                    "available": True,
 6796                    "function_name": "calculation_extract_snpeff_hgvs",
 6797                    "function_params": ["snpeff_hgvs", "ANN"],
 6798                },
 6799                "snpeff_ann_explode": {
 6800                    "type": "python",
 6801                    "name": "snpeff_ann_explode",
 6802                    "description": "Explode snpEff annotations with uniquify values",
 6803                    "available": True,
 6804                    "function_name": "calculation_snpeff_ann_explode",
 6805                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6806                },
 6807                "snpeff_ann_explode_uniquify": {
 6808                    "type": "python",
 6809                    "name": "snpeff_ann_explode_uniquify",
 6810                    "description": "Explode snpEff annotations",
 6811                    "available": True,
 6812                    "function_name": "calculation_snpeff_ann_explode",
 6813                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6814                },
 6815                "snpeff_ann_explode_json": {
 6816                    "type": "python",
 6817                    "name": "snpeff_ann_explode_json",
 6818                    "description": "Explode snpEff annotations in JSON format",
 6819                    "available": True,
 6820                    "function_name": "calculation_snpeff_ann_explode",
 6821                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6822                },
 6823                "NOMEN": {
 6824                    "type": "python",
 6825                    "name": "NOMEN",
 6826                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
 6827                    "available": True,
 6828                    "function_name": "calculation_extract_nomen",
 6829                    "function_params": [],
 6830                },
 6831                "FINDBYPIPELINE": {
 6832                    "type": "python",
 6833                    "name": "FINDBYPIPELINE",
 6834                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6835                    "available": True,
 6836                    "function_name": "calculation_find_by_pipeline",
 6837                    "function_params": ["findbypipeline"],
 6838                },
 6839                "FINDBYSAMPLE": {
 6840                    "type": "python",
 6841                    "name": "FINDBYSAMPLE",
 6842                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6843                    "available": True,
 6844                    "function_name": "calculation_find_by_pipeline",
 6845                    "function_params": ["findbysample"],
 6846                },
 6847                "GENOTYPECONCORDANCE": {
 6848                    "type": "python",
 6849                    "name": "GENOTYPECONCORDANCE",
 6850                    "description": "Concordance of genotype for multi caller VCF",
 6851                    "available": True,
 6852                    "function_name": "calculation_genotype_concordance",
 6853                    "function_params": [],
 6854                },
 6855                "BARCODE": {
 6856                    "type": "python",
 6857                    "name": "BARCODE",
 6858                    "description": "BARCODE as VaRank tool",
 6859                    "available": True,
 6860                    "function_name": "calculation_barcode",
 6861                    "function_params": [],
 6862                },
 6863                "BARCODEFAMILY": {
 6864                    "type": "python",
 6865                    "name": "BARCODEFAMILY",
 6866                    "description": "BARCODEFAMILY as VaRank tool",
 6867                    "available": True,
 6868                    "function_name": "calculation_barcode_family",
 6869                    "function_params": ["BCF"],
 6870                },
 6871                "TRIO": {
 6872                    "type": "python",
 6873                    "name": "TRIO",
 6874                    "description": "Inheritance for a trio family",
 6875                    "available": True,
 6876                    "function_name": "calculation_trio",
 6877                    "function_params": [],
 6878                },
 6879                "VAF": {
 6880                    "type": "python",
 6881                    "name": "VAF",
 6882                    "description": "Variant Allele Frequency (VAF) harmonization",
 6883                    "available": True,
 6884                    "function_name": "calculation_vaf_normalization",
 6885                    "function_params": [],
 6886                },
 6887                "VAF_stats": {
 6888                    "type": "python",
 6889                    "name": "VAF_stats",
 6890                    "description": "Variant Allele Frequency (VAF) statistics",
 6891                    "available": True,
 6892                    "function_name": "calculation_genotype_stats",
 6893                    "function_params": ["VAF"],
 6894                },
 6895                "DP_stats": {
 6896                    "type": "python",
 6897                    "name": "DP_stats",
 6898                    "description": "Depth (DP) statistics",
 6899                    "available": True,
 6900                    "function_name": "calculation_genotype_stats",
 6901                    "function_params": ["DP"],
 6902                },
 6903                "variant_id": {
 6904                    "type": "python",
 6905                    "name": "variant_id",
 6906                    "description": "Variant ID generated from variant position and type",
 6907                    "available": True,
 6908                    "function_name": "calculation_variant_id",
 6909                    "function_params": [],
 6910                },
 6911                "transcripts_json": {
 6912                    "type": "python",
 6913                    "name": "transcripts_json",
 6914                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6915                    "available": True,
 6916                    "function_name": "calculation_transcripts_annotation",
 6917                    "function_params": ["transcripts_json", None],
 6918                },
 6919                "transcripts_ann": {
 6920                    "type": "python",
 6921                    "name": "transcripts_ann",
 6922                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6923                    "available": True,
 6924                    "function_name": "calculation_transcripts_annotation",
 6925                    "function_params": [None, "transcripts_ann"],
 6926                },
 6927                "transcripts_annotations": {
 6928                    "type": "python",
 6929                    "name": "transcripts_annotations",
 6930                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6931                    "available": True,
 6932                    "function_name": "calculation_transcripts_annotation",
 6933                    "function_params": [None, None],
 6934                },
 6935                "transcripts_prioritization": {
 6936                    "type": "python",
 6937                    "name": "transcripts_prioritization",
 6938                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6939                    "available": True,
 6940                    "function_name": "calculation_transcripts_prioritization",
 6941                    "function_params": [],
 6942                },
 6943                "transcripts_export": {
 6944                    "type": "python",
 6945                    "name": "transcripts_export",
 6946                    "description": "Export transcripts table/view as a file (using param.json)",
 6947                    "available": True,
 6948                    "function_name": "calculation_transcripts_export",
 6949                    "function_params": [],
 6950                },
 6951            },
 6952            "prioritizations": {
 6953                "default": {
 6954                    "ANN2": [
 6955                        {
 6956                            "type": "contains",
 6957                            "value": "HIGH",
 6958                            "score": 5,
 6959                            "flag": "PASS",
 6960                            "comment": [
 6961                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6962                            ],
 6963                        },
 6964                        {
 6965                            "type": "contains",
 6966                            "value": "MODERATE",
 6967                            "score": 3,
 6968                            "flag": "PASS",
 6969                            "comment": [
 6970                                "A non-disruptive variant that might change protein effectiveness"
 6971                            ],
 6972                        },
 6973                        {
 6974                            "type": "contains",
 6975                            "value": "LOW",
 6976                            "score": 0,
 6977                            "flag": "FILTERED",
 6978                            "comment": [
 6979                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 6980                            ],
 6981                        },
 6982                        {
 6983                            "type": "contains",
 6984                            "value": "MODIFIER",
 6985                            "score": 0,
 6986                            "flag": "FILTERED",
 6987                            "comment": [
 6988                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 6989                            ],
 6990                        },
 6991                    ],
 6992                }
 6993            },
 6994        }
 6995
 6996        return config_default.get(name, None)
 6997
 6998    def get_config_json(
 6999        self, name: str, config_dict: dict = {}, config_file: str = None
 7000    ) -> dict:
 7001        """
 7002        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 7003        default values, a dictionary, and a file.
 7004
 7005        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 7006        the name of the configuration. It is used to identify and retrieve the configuration settings
 7007        for a specific component or module
 7008        :type name: str
 7009        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 7010        dictionary that allows you to provide additional configuration settings or overrides. When you
 7011        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 7012        the key is the configuration setting you want to override or
 7013        :type config_dict: dict
 7014        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 7015        specify the path to a configuration file that contains additional settings. If provided, the
 7016        function will read the contents of this file and update the configuration dictionary with the
 7017        values found in the file, overriding any existing values with the
 7018        :type config_file: str
 7019        :return: The function `get_config_json` returns a dictionary containing the configuration
 7020        settings.
 7021        """
 7022
 7023        # Create with default prioritizations
 7024        config_default = self.get_config_default(name=name)
 7025        configuration = config_default
 7026        # log.debug(f"configuration={configuration}")
 7027
 7028        # Replace prioritizations from dict
 7029        for config in config_dict:
 7030            configuration[config] = config_dict[config]
 7031
 7032        # Replace prioritizations from file
 7033        config_file = full_path(config_file)
 7034        if config_file:
 7035            if os.path.exists(config_file):
 7036                with open(config_file) as config_file_content:
 7037                    config_file_dict = json.load(config_file_content)
 7038                for config in config_file_dict:
 7039                    configuration[config] = config_file_dict[config]
 7040            else:
 7041                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 7042                log.error(msg_error)
 7043                raise ValueError(msg_error)
 7044
 7045        return configuration
 7046
 7047    def prioritization(
 7048        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 7049    ) -> bool:
 7050        """
 7051        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7052        prioritizes variants based on configured profiles and criteria.
 7053
 7054        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7055        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7056        a table name is provided, the method will prioritize the variants in that specific table
 7057        :type table: str
 7058        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7059        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7060        provided, the code will use a default prefix value of "PZ"
 7061        :type pz_prefix: str
 7062        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7063        additional parameters specific to the prioritization process. These parameters can include
 7064        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7065        configurations needed for the prioritization of variants in a V
 7066        :type pz_param: dict
 7067        :return: A boolean value (True) is being returned from the `prioritization` function.
 7068        """
 7069
 7070        # Config
 7071        config = self.get_config()
 7072
 7073        # Param
 7074        param = self.get_param()
 7075
 7076        # Prioritization param
 7077        if pz_param is not None:
 7078            prioritization_param = pz_param
 7079        else:
 7080            prioritization_param = param.get("prioritization", {})
 7081
 7082        # Configuration profiles
 7083        prioritization_config_file = prioritization_param.get(
 7084            "prioritization_config", None
 7085        )
 7086        prioritization_config_file = full_path(prioritization_config_file)
 7087        prioritizations_config = self.get_config_json(
 7088            name="prioritizations", config_file=prioritization_config_file
 7089        )
 7090
 7091        # Prioritization prefix
 7092        pz_prefix_default = "PZ"
 7093        if pz_prefix is None:
 7094            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7095
 7096        # Prioritization options
 7097        profiles = prioritization_param.get("profiles", [])
 7098        if isinstance(profiles, str):
 7099            profiles = profiles.split(",")
 7100        pzfields = prioritization_param.get(
 7101            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7102        )
 7103        if isinstance(pzfields, str):
 7104            pzfields = pzfields.split(",")
 7105        default_profile = prioritization_param.get("default_profile", None)
 7106        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7107        prioritization_score_mode = prioritization_param.get(
 7108            "prioritization_score_mode", "HOWARD"
 7109        )
 7110
 7111        # Quick Prioritizations
 7112        prioritizations = param.get("prioritizations", None)
 7113        if prioritizations:
 7114            log.info("Quick Prioritization:")
 7115            for profile in prioritizations.split(","):
 7116                if profile not in profiles:
 7117                    profiles.append(profile)
 7118                    log.info(f"   {profile}")
 7119
 7120        # If profile "ALL" provided, all profiles in the config profiles
 7121        if "ALL" in profiles:
 7122            profiles = list(prioritizations_config.keys())
 7123
 7124        for profile in profiles:
 7125            if prioritizations_config.get(profile, None):
 7126                log.debug(f"Profile '{profile}' configured")
 7127            else:
 7128                msg_error = f"Profile '{profile}' NOT configured"
 7129                log.error(msg_error)
 7130                raise ValueError(msg_error)
 7131
 7132        if profiles:
 7133            log.info(f"Prioritization... ")
 7134        else:
 7135            log.debug(f"No profile defined")
 7136            return False
 7137
 7138        if not default_profile and len(profiles):
 7139            default_profile = profiles[0]
 7140
 7141        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7142        log.debug("Profiles to check: " + str(list(profiles)))
 7143
 7144        # Variables
 7145        if table is not None:
 7146            table_variants = table
 7147        else:
 7148            table_variants = self.get_table_variants(clause="update")
 7149        log.debug(f"Table to prioritize: {table_variants}")
 7150
 7151        # Added columns
 7152        added_columns = []
 7153
 7154        # Create list of PZfields
 7155        # List of PZFields
 7156        list_of_pzfields_original = pzfields + [
 7157            pzfield + pzfields_sep + profile
 7158            for pzfield in pzfields
 7159            for profile in profiles
 7160        ]
 7161        list_of_pzfields = []
 7162        log.debug(f"{list_of_pzfields_original}")
 7163
 7164        # Remove existing PZfields to use if exists
 7165        for pzfield in list_of_pzfields_original:
 7166            if self.get_header().infos.get(pzfield, None) is None:
 7167                list_of_pzfields.append(pzfield)
 7168                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7169            else:
 7170                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7171
 7172        if list_of_pzfields:
 7173
 7174            # Explode Infos prefix
 7175            explode_infos_prefix = self.get_explode_infos_prefix()
 7176
 7177            # PZfields tags description
 7178            PZfields_INFOS = {
 7179                f"{pz_prefix}Tags": {
 7180                    "ID": f"{pz_prefix}Tags",
 7181                    "Number": ".",
 7182                    "Type": "String",
 7183                    "Description": "Variant tags based on annotation criteria",
 7184                },
 7185                f"{pz_prefix}Score": {
 7186                    "ID": f"{pz_prefix}Score",
 7187                    "Number": 1,
 7188                    "Type": "Integer",
 7189                    "Description": "Variant score based on annotation criteria",
 7190                },
 7191                f"{pz_prefix}Flag": {
 7192                    "ID": f"{pz_prefix}Flag",
 7193                    "Number": 1,
 7194                    "Type": "String",
 7195                    "Description": "Variant flag based on annotation criteria",
 7196                },
 7197                f"{pz_prefix}Comment": {
 7198                    "ID": f"{pz_prefix}Comment",
 7199                    "Number": ".",
 7200                    "Type": "String",
 7201                    "Description": "Variant comment based on annotation criteria",
 7202                },
 7203                f"{pz_prefix}Infos": {
 7204                    "ID": f"{pz_prefix}Infos",
 7205                    "Number": ".",
 7206                    "Type": "String",
 7207                    "Description": "Variant infos based on annotation criteria",
 7208                },
 7209                f"{pz_prefix}Class": {
 7210                    "ID": f"{pz_prefix}Class",
 7211                    "Number": ".",
 7212                    "Type": "String",
 7213                    "Description": "Variant class based on annotation criteria",
 7214                },
 7215            }
 7216
 7217            # Create INFO fields if not exist
 7218            for field in PZfields_INFOS:
 7219                field_ID = PZfields_INFOS[field]["ID"]
 7220                field_description = PZfields_INFOS[field]["Description"]
 7221                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7222                    field_description = (
 7223                        PZfields_INFOS[field]["Description"]
 7224                        + f", profile {default_profile}"
 7225                    )
 7226                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7227                        field_ID,
 7228                        PZfields_INFOS[field]["Number"],
 7229                        PZfields_INFOS[field]["Type"],
 7230                        field_description,
 7231                        "unknown",
 7232                        "unknown",
 7233                        code_type_map[PZfields_INFOS[field]["Type"]],
 7234                    )
 7235
 7236            # Create INFO fields if not exist for each profile
 7237            for profile in prioritizations_config:
 7238                if profile in profiles or profiles == []:
 7239                    for field in PZfields_INFOS:
 7240                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7241                        field_description = (
 7242                            PZfields_INFOS[field]["Description"]
 7243                            + f", profile {profile}"
 7244                        )
 7245                        if (
 7246                            field_ID not in self.get_header().infos
 7247                            and field in pzfields
 7248                        ):
 7249                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7250                                field_ID,
 7251                                PZfields_INFOS[field]["Number"],
 7252                                PZfields_INFOS[field]["Type"],
 7253                                field_description,
 7254                                "unknown",
 7255                                "unknown",
 7256                                code_type_map[PZfields_INFOS[field]["Type"]],
 7257                            )
 7258
 7259            # Header
 7260            for pzfield in list_of_pzfields:
 7261                if re.match(f"{pz_prefix}Score.*", pzfield):
 7262                    added_column = self.add_column(
 7263                        table_name=table_variants,
 7264                        column_name=pzfield,
 7265                        column_type="INTEGER",
 7266                        default_value="0",
 7267                    )
 7268                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7269                    added_column = self.add_column(
 7270                        table_name=table_variants,
 7271                        column_name=pzfield,
 7272                        column_type="BOOLEAN",
 7273                        default_value="1",
 7274                    )
 7275                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7276                    added_column = self.add_column(
 7277                        table_name=table_variants,
 7278                        column_name=pzfield,
 7279                        column_type="VARCHAR[]",
 7280                        default_value="null",
 7281                    )
 7282                else:
 7283                    added_column = self.add_column(
 7284                        table_name=table_variants,
 7285                        column_name=pzfield,
 7286                        column_type="STRING",
 7287                        default_value="''",
 7288                    )
 7289                added_columns.append(added_column)
 7290
 7291            # Profiles
 7292            if profiles:
 7293
 7294                # foreach profile in configuration file
 7295                for profile in prioritizations_config:
 7296
 7297                    # If profile is asked in param, or ALL are asked (empty profile [])
 7298                    if profile in profiles or profiles == []:
 7299                        log.info(f"Profile '{profile}'")
 7300
 7301                        sql_set_info_option = ""
 7302
 7303                        sql_set_info = []
 7304
 7305                        # PZ fields set
 7306
 7307                        # PZScore
 7308                        if (
 7309                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7310                            in list_of_pzfields
 7311                        ):
 7312                            sql_set_info.append(
 7313                                f"""
 7314                                    concat(
 7315                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7316                                        {pz_prefix}Score{pzfields_sep}{profile}
 7317                                    ) 
 7318                                """
 7319                            )
 7320                            if (
 7321                                profile == default_profile
 7322                                and f"{pz_prefix}Score" in list_of_pzfields
 7323                            ):
 7324                                sql_set_info.append(
 7325                                    f"""
 7326                                        concat(
 7327                                            '{pz_prefix}Score=',
 7328                                            {pz_prefix}Score{pzfields_sep}{profile}
 7329                                        )
 7330                                    """
 7331                                )
 7332
 7333                        # PZFlag
 7334                        if (
 7335                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7336                            in list_of_pzfields
 7337                        ):
 7338                            sql_set_info.append(
 7339                                f"""
 7340                                    concat(
 7341                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7342                                        CASE 
 7343                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7344                                            THEN 'PASS'
 7345                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7346                                            THEN 'FILTERED'
 7347                                        END
 7348                                    ) 
 7349                                """
 7350                            )
 7351                            if (
 7352                                profile == default_profile
 7353                                and f"{pz_prefix}Flag" in list_of_pzfields
 7354                            ):
 7355                                sql_set_info.append(
 7356                                    f"""
 7357                                        concat(
 7358                                            '{pz_prefix}Flag=',
 7359                                            CASE 
 7360                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7361                                                THEN 'PASS'
 7362                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7363                                                THEN 'FILTERED'
 7364                                            END
 7365                                        )
 7366                                    """
 7367                                )
 7368
 7369                        # PZClass
 7370                        if (
 7371                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7372                            in list_of_pzfields
 7373                        ):
 7374                            sql_set_info.append(
 7375                                f"""
 7376                                    concat(
 7377                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7378                                        CASE
 7379                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7380                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7381                                            ELSE '.'
 7382                                        END 
 7383                                    )
 7384                                    
 7385                                """
 7386                            )
 7387                            if (
 7388                                profile == default_profile
 7389                                and f"{pz_prefix}Class" in list_of_pzfields
 7390                            ):
 7391                                sql_set_info.append(
 7392                                    f"""
 7393                                        concat(
 7394                                            '{pz_prefix}Class=',
 7395                                            CASE
 7396                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7397                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7398                                                ELSE '.'
 7399                                            END 
 7400                                        )
 7401                                    """
 7402                                )
 7403
 7404                        # PZComment
 7405                        if (
 7406                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7407                            in list_of_pzfields
 7408                        ):
 7409                            sql_set_info.append(
 7410                                f"""
 7411                                    CASE
 7412                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7413                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7414                                        ELSE ''
 7415                                    END
 7416                                """
 7417                            )
 7418                            if (
 7419                                profile == default_profile
 7420                                and f"{pz_prefix}Comment" in list_of_pzfields
 7421                            ):
 7422                                sql_set_info.append(
 7423                                    f"""
 7424                                        CASE
 7425                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7426                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7427                                            ELSE ''
 7428                                        END
 7429                                    """
 7430                                )
 7431
 7432                        # PZInfos
 7433                        if (
 7434                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7435                            in list_of_pzfields
 7436                        ):
 7437                            sql_set_info.append(
 7438                                f"""
 7439                                    CASE
 7440                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7441                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7442                                        ELSE ''
 7443                                    END
 7444                                """
 7445                            )
 7446                            if (
 7447                                profile == default_profile
 7448                                and f"{pz_prefix}Infos" in list_of_pzfields
 7449                            ):
 7450                                sql_set_info.append(
 7451                                    f"""
 7452                                        CASE
 7453                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7454                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7455                                            ELSE ''
 7456                                        END
 7457                                    """
 7458                                )
 7459
 7460                        # Merge PZfields
 7461                        sql_set_info_option = ""
 7462                        sql_set_sep = ""
 7463                        for sql_set in sql_set_info:
 7464                            if sql_set_sep:
 7465                                sql_set_info_option += f"""
 7466                                    , concat('{sql_set_sep}', {sql_set})
 7467                                """
 7468                            else:
 7469                                sql_set_info_option += f"""
 7470                                    , {sql_set}
 7471                                """
 7472                            sql_set_sep = ";"
 7473
 7474                        sql_queries = []
 7475                        for annotation in prioritizations_config[profile]:
 7476
 7477                            # skip special sections
 7478                            if annotation.startswith("_"):
 7479                                continue
 7480
 7481                            # For each criterions
 7482                            for criterion in prioritizations_config[profile][
 7483                                annotation
 7484                            ]:
 7485
 7486                                # Criterion mode
 7487                                criterion_mode = None
 7488                                if np.any(
 7489                                    np.isin(list(criterion.keys()), ["type", "value"])
 7490                                ):
 7491                                    criterion_mode = "operation"
 7492                                elif np.any(
 7493                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7494                                ):
 7495                                    criterion_mode = "sql"
 7496                                log.debug(f"Criterion Mode: {criterion_mode}")
 7497
 7498                                # Criterion parameters
 7499                                criterion_type = criterion.get("type", None)
 7500                                criterion_value = criterion.get("value", None)
 7501                                criterion_sql = criterion.get("sql", None)
 7502                                criterion_fields = criterion.get("fields", None)
 7503                                criterion_score = criterion.get("score", 0)
 7504                                criterion_flag = criterion.get("flag", "PASS")
 7505                                criterion_class = criterion.get("class", None)
 7506                                criterion_flag_bool = criterion_flag == "PASS"
 7507                                criterion_comment = (
 7508                                    ", ".join(criterion.get("comment", []))
 7509                                    .replace("'", "''")
 7510                                    .replace(";", ",")
 7511                                    .replace("\t", " ")
 7512                                )
 7513                                criterion_infos = (
 7514                                    str(criterion)
 7515                                    .replace("'", "''")
 7516                                    .replace(";", ",")
 7517                                    .replace("\t", " ")
 7518                                )
 7519
 7520                                # SQL
 7521                                if criterion_sql is not None and isinstance(
 7522                                    criterion_sql, list
 7523                                ):
 7524                                    criterion_sql = " ".join(criterion_sql)
 7525
 7526                                # Fields and explode
 7527                                if criterion_fields is None:
 7528                                    criterion_fields = [annotation]
 7529                                if not isinstance(criterion_fields, list):
 7530                                    criterion_fields = str(criterion_fields).split(",")
 7531
 7532                                # Class
 7533                                if criterion_class is not None and not isinstance(
 7534                                    criterion_class, list
 7535                                ):
 7536                                    criterion_class = str(criterion_class).split(",")
 7537
 7538                                for annotation_field in criterion_fields:
 7539
 7540                                    # Explode specific annotation
 7541                                    log.debug(
 7542                                        f"Explode annotation '{annotation_field}'"
 7543                                    )
 7544                                    added_columns += self.explode_infos(
 7545                                        prefix=explode_infos_prefix,
 7546                                        fields=[annotation_field],
 7547                                        table=table_variants,
 7548                                    )
 7549                                    extra_infos = self.get_extra_infos(
 7550                                        table=table_variants
 7551                                    )
 7552
 7553                                    # Check if annotation field is present
 7554                                    if (
 7555                                        f"{explode_infos_prefix}{annotation_field}"
 7556                                        not in extra_infos
 7557                                    ):
 7558                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7559                                        log.error(msq_err)
 7560                                        raise ValueError(msq_err)
 7561                                    else:
 7562                                        log.debug(
 7563                                            f"Annotation '{annotation_field}' in data"
 7564                                        )
 7565
 7566                                sql_set = []
 7567                                sql_set_info = []
 7568
 7569                                # PZ fields set
 7570
 7571                                # PZScore
 7572                                if (
 7573                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7574                                    in list_of_pzfields
 7575                                ):
 7576                                    # if prioritization_score_mode == "HOWARD":
 7577                                    #     sql_set.append(
 7578                                    #         f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7579                                    #     )
 7580                                    # VaRank prioritization score mode
 7581                                    if prioritization_score_mode == "VaRank":
 7582                                        sql_set.append(
 7583                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
 7584                                        )
 7585                                    # default HOWARD prioritization score mode
 7586                                    else:
 7587                                        sql_set.append(
 7588                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7589                                        )
 7590
 7591                                # PZFlag
 7592                                if (
 7593                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7594                                    in list_of_pzfields
 7595                                ):
 7596                                    sql_set.append(
 7597                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7598                                    )
 7599
 7600                                # PZClass
 7601                                if (
 7602                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7603                                    in list_of_pzfields
 7604                                    and criterion_class is not None
 7605                                ):
 7606                                    sql_set.append(
 7607                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7608                                    )
 7609
 7610                                # PZComment
 7611                                if (
 7612                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7613                                    in list_of_pzfields
 7614                                ):
 7615                                    sql_set.append(
 7616                                        f"""
 7617                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7618                                                concat(
 7619                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7620                                                    CASE 
 7621                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7622                                                        THEN ', '
 7623                                                        ELSE ''
 7624                                                    END,
 7625                                                    '{criterion_comment}'
 7626                                                )
 7627                                        """
 7628                                    )
 7629
 7630                                # PZInfos
 7631                                if (
 7632                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7633                                    in list_of_pzfields
 7634                                ):
 7635                                    sql_set.append(
 7636                                        f"""
 7637                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7638                                                concat(
 7639                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7640                                                    '{criterion_infos}'
 7641                                                )
 7642                                        """
 7643                                    )
 7644                                sql_set_option = ",".join(sql_set)
 7645
 7646                                # Criterion and comparison
 7647                                if sql_set_option:
 7648
 7649                                    if criterion_mode in ["operation"]:
 7650
 7651                                        try:
 7652                                            float(criterion_value)
 7653                                            sql_update = f"""
 7654                                                UPDATE {table_variants}
 7655                                                SET {sql_set_option}
 7656                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7657                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7658                                            """
 7659                                        except:
 7660                                            contains_option = ""
 7661                                            if criterion_type == "contains":
 7662                                                contains_option = ".*"
 7663                                            sql_update = f"""
 7664                                                UPDATE {table_variants}
 7665                                                SET {sql_set_option}
 7666                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7667                                            """
 7668                                        sql_queries.append(sql_update)
 7669
 7670                                    elif criterion_mode in ["sql"]:
 7671
 7672                                        sql_update = f"""
 7673                                            UPDATE {table_variants}
 7674                                            SET {sql_set_option}
 7675                                            WHERE {criterion_sql}
 7676                                        """
 7677                                        sql_queries.append(sql_update)
 7678
 7679                                    else:
 7680                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7681                                        log.error(msg_err)
 7682                                        raise ValueError(msg_err)
 7683
 7684                                else:
 7685                                    log.warning(
 7686                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7687                                    )
 7688
 7689                        # PZTags
 7690                        if (
 7691                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7692                            in list_of_pzfields
 7693                        ):
 7694
                            # Create PZTags value
 7696                            pztags_value = ""
 7697                            pztags_sep_default = ","
 7698                            pztags_sep = ""
 7699                            for pzfield in pzfields:
 7700                                if pzfield not in [f"{pz_prefix}Tags"]:
 7701                                    if (
 7702                                        f"{pzfield}{pzfields_sep}{profile}"
 7703                                        in list_of_pzfields
 7704                                    ):
 7705                                        if pzfield in [f"{pz_prefix}Flag"]:
 7706                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7707                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7708                                                    THEN 'PASS'
 7709                                                    ELSE 'FILTERED'
 7710                                                END, '"""
 7711                                        elif pzfield in [f"{pz_prefix}Class"]:
 7712                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7713                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7714                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7715                                                    ELSE '.'
 7716                                                END, '"""
 7717                                        else:
 7718                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7719                                        pztags_sep = pztags_sep_default
 7720
                            # Add Query update for PZTags
 7722                            sql_update_pztags = f"""
 7723                                UPDATE {table_variants}
 7724                                SET INFO = concat(
 7725                                        INFO,
 7726                                        CASE WHEN INFO NOT in ('','.')
 7727                                                THEN ';'
 7728                                                ELSE ''
 7729                                        END,
 7730                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7731                                    )
 7732                                """
 7733                            sql_queries.append(sql_update_pztags)
 7734
                            # Add Query update for PZTags for the default profile
 7736                            if profile == default_profile:
 7737                                sql_update_pztags_default = f"""
 7738                                UPDATE {table_variants}
 7739                                SET INFO = concat(
 7740                                        INFO,
 7741                                        ';',
 7742                                        '{pz_prefix}Tags={pztags_value}'
 7743                                    )
 7744                                """
 7745                                sql_queries.append(sql_update_pztags_default)
 7746
 7747                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7748
 7749                        if sql_queries:
 7750
 7751                            for sql_query in sql_queries:
 7752                                log.debug(
 7753                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7754                                )
 7755                                self.conn.execute(sql_query)
 7756
 7757                        log.info(f"""Profile '{profile}' - Update... """)
 7758                        sql_query_update = f"""
 7759                            UPDATE {table_variants}
 7760                            SET INFO =  
 7761                                concat(
 7762                                    CASE
 7763                                        WHEN INFO NOT IN ('','.')
 7764                                        THEN concat(INFO, ';')
 7765                                        ELSE ''
 7766                                    END
 7767                                    {sql_set_info_option}
 7768                                )
 7769                        """
 7770                        self.conn.execute(sql_query_update)
 7771
 7772        else:
 7773
 7774            log.warning(f"No profiles in parameters")
 7775
 7776        # Remove added columns
 7777        for added_column in added_columns:
 7778            self.drop_column(column=added_column)
 7779
 7780        # Explode INFOS fields into table fields
 7781        if self.get_explode_infos():
 7782            self.explode_infos(
 7783                prefix=self.get_explode_infos_prefix(),
 7784                fields=self.get_explode_infos_fields(),
 7785                force=True,
 7786            )
 7787
 7788        return True
 7789
 7790    ###
 7791    # HGVS
 7792    ###
 7793
 7794    def annotation_hgvs(self, threads: int = None) -> None:
 7795        """
 7796        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7797        coordinates and alleles.
 7798
 7799        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7800        threads to use for parallel processing. If no value is provided, it will default to the number
 7801        of threads obtained from the `get_threads()` method
 7802        :type threads: int
 7803        """
 7804
 7805        # Function for each partition of the Dask Dataframe
 7806        def partition_function(partition):
 7807            """
 7808            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7809            each row of a DataFrame called `partition`.
 7810
 7811            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7812            to be processed
 7813            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7814            the "partition" dataframe along the axis 1.
 7815            """
 7816            return partition.apply(annotation_hgvs_partition, axis=1)
 7817
 7818        def annotation_hgvs_partition(row) -> str:
 7819            """
 7820            The function `annotation_hgvs_partition` takes in a row of data and returns a string
 7821            containing a list of HGVS names associated with the given genomic coordinates and alleles.
 7822
 7823            :param row: A dictionary-like object that contains the values for the following keys:
 7824            :return: a string that contains the HGVS names associated with the given row of data.
 7825            """
 7826
 7827            chr = row["CHROM"]
 7828            pos = row["POS"]
 7829            ref = row["REF"]
 7830            alt = row["ALT"]
 7831
 7832            # Find list of associated transcripts
 7833            transcripts_list = list(
 7834                polars_conn.execute(
 7835                    f"""
 7836                SELECT transcript
 7837                FROM refseq_df
 7838                WHERE CHROM='{chr}'
 7839                AND POS={pos}
 7840            """
 7841                )["transcript"]
 7842            )
 7843
 7844            # Full HGVS annotation in list
 7845            hgvs_full_list = []
 7846
 7847            for transcript_name in transcripts_list:
 7848
 7849                # Transcript
 7850                transcript = get_transcript(
 7851                    transcripts=transcripts, transcript_name=transcript_name
 7852                )
 7853                # Exon
 7854                if use_exon:
 7855                    exon = transcript.find_exon_number(pos)
 7856                else:
 7857                    exon = None
 7858                # Protein
 7859                transcript_protein = None
 7860                if use_protein or add_protein or full_format:
 7861                    transcripts_protein = list(
 7862                        polars_conn.execute(
 7863                            f"""
 7864                        SELECT protein
 7865                        FROM refseqlink_df
 7866                        WHERE transcript='{transcript_name}'
 7867                        LIMIT 1
 7868                    """
 7869                        )["protein"]
 7870                    )
 7871                    if len(transcripts_protein):
 7872                        transcript_protein = transcripts_protein[0]
 7873
 7874                # HGVS name
 7875                hgvs_name = format_hgvs_name(
 7876                    chr,
 7877                    pos,
 7878                    ref,
 7879                    alt,
 7880                    genome=genome,
 7881                    transcript=transcript,
 7882                    transcript_protein=transcript_protein,
 7883                    exon=exon,
 7884                    use_gene=use_gene,
 7885                    use_protein=use_protein,
 7886                    full_format=full_format,
 7887                    use_version=use_version,
 7888                    codon_type=codon_type,
 7889                )
 7890                hgvs_full_list.append(hgvs_name)
 7891                if add_protein and not use_protein and not full_format:
 7892                    hgvs_name = format_hgvs_name(
 7893                        chr,
 7894                        pos,
 7895                        ref,
 7896                        alt,
 7897                        genome=genome,
 7898                        transcript=transcript,
 7899                        transcript_protein=transcript_protein,
 7900                        exon=exon,
 7901                        use_gene=use_gene,
 7902                        use_protein=True,
 7903                        full_format=False,
 7904                        use_version=use_version,
 7905                        codon_type=codon_type,
 7906                    )
 7907                    hgvs_full_list.append(hgvs_name)
 7908
 7909            # Create liste of HGVS annotations
 7910            hgvs_full = ",".join(hgvs_full_list)
 7911
 7912            return hgvs_full
 7913
 7914        # Polars connexion
 7915        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7916
 7917        # Config
 7918        config = self.get_config()
 7919
 7920        # Databases
 7921        # Genome
 7922        databases_genomes_folders = (
 7923            config.get("folders", {})
 7924            .get("databases", {})
 7925            .get("genomes", DEFAULT_GENOME_FOLDER)
 7926        )
 7927        databases_genome = (
 7928            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7929        )
 7930        # refseq database folder
 7931        databases_refseq_folders = (
 7932            config.get("folders", {})
 7933            .get("databases", {})
 7934            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7935        )
 7936        # refseq
 7937        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7938        # refSeqLink
 7939        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7940
 7941        # Param
 7942        param = self.get_param()
 7943
 7944        # Quick HGVS
 7945        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7946            log.info(f"Quick HGVS Annotation:")
 7947            if not param.get("hgvs", None):
 7948                param["hgvs"] = {}
 7949            for option in param.get("hgvs_options", "").split(","):
 7950                option_var_val = option.split("=")
 7951                option_var = option_var_val[0]
 7952                if len(option_var_val) > 1:
 7953                    option_val = option_var_val[1]
 7954                else:
 7955                    option_val = "True"
 7956                if option_val.upper() in ["TRUE"]:
 7957                    option_val = True
 7958                elif option_val.upper() in ["FALSE"]:
 7959                    option_val = False
 7960                log.info(f"   {option_var}={option_val}")
 7961                param["hgvs"][option_var] = option_val
 7962
 7963        # Check if HGVS annotation enabled
 7964        if "hgvs" in param:
 7965            log.info(f"HGVS Annotation... ")
 7966            for hgvs_option in param.get("hgvs", {}):
 7967                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7968        else:
 7969            return
 7970
 7971        # HGVS Param
 7972        param_hgvs = param.get("hgvs", {})
 7973        use_exon = param_hgvs.get("use_exon", False)
 7974        use_gene = param_hgvs.get("use_gene", False)
 7975        use_protein = param_hgvs.get("use_protein", False)
 7976        add_protein = param_hgvs.get("add_protein", False)
 7977        full_format = param_hgvs.get("full_format", False)
 7978        use_version = param_hgvs.get("use_version", False)
 7979        codon_type = param_hgvs.get("codon_type", "3")
 7980
        # refSeq and refSeqLink databases
 7982        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 7983        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 7984
 7985        # Assembly
 7986        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 7987
 7988        # Genome
 7989        genome_file = None
 7990        if find_genome(databases_genome):
 7991            genome_file = find_genome(databases_genome)
 7992        else:
 7993            genome_file = find_genome(
 7994                genome_path=databases_genomes_folders, assembly=assembly
 7995            )
 7996        log.debug("Genome: " + str(genome_file))
 7997
        # refSeq
 7999        refseq_file = find_file_prefix(
 8000            input_file=databases_refseq,
 8001            prefix="ncbiRefSeq",
 8002            folder=databases_refseq_folders,
 8003            assembly=assembly,
 8004        )
 8005        log.debug("refSeq: " + str(refseq_file))
 8006
 8007        # refSeqLink
 8008        refseqlink_file = find_file_prefix(
 8009            input_file=databases_refseqlink,
 8010            prefix="ncbiRefSeqLink",
 8011            folder=databases_refseq_folders,
 8012            assembly=assembly,
 8013        )
 8014        log.debug("refSeqLink: " + str(refseqlink_file))
 8015
 8016        # Threads
 8017        if not threads:
 8018            threads = self.get_threads()
 8019        log.debug("Threads: " + str(threads))
 8020
 8021        # Variables
 8022        table_variants = self.get_table_variants(clause="update")
 8023
 8024        # Get variants SNV and InDel only
 8025        query_variants = f"""
 8026            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 8027            FROM {table_variants}
 8028            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 8029            """
 8030        df_variants = self.get_query_to_df(query_variants)
 8031
 8032        # Added columns
 8033        added_columns = []
 8034
 8035        # Add hgvs column in variants table
 8036        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 8037        added_column = self.add_column(
 8038            table_variants, hgvs_column_name, "STRING", default_value=None
 8039        )
 8040        added_columns.append(added_column)
 8041
 8042        log.debug(f"refSeq loading...")
 8043        # refSeq in duckDB
 8044        refseq_table = get_refseq_table(
 8045            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 8046        )
 8047        # Loading all refSeq in Dataframe
 8048        refseq_query = f"""
 8049            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 8050            FROM {refseq_table}
 8051            JOIN df_variants ON (
 8052                {refseq_table}.chrom = df_variants.CHROM
 8053                AND {refseq_table}.txStart<=df_variants.POS
 8054                AND {refseq_table}.txEnd>=df_variants.POS
 8055            )
 8056        """
 8057        refseq_df = self.conn.query(refseq_query).pl()
 8058
 8059        if refseqlink_file:
 8060            log.debug(f"refSeqLink loading...")
 8061            # refSeqLink in duckDB
 8062            refseqlink_table = get_refseq_table(
 8063                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 8064            )
 8065            # Loading all refSeqLink in Dataframe
 8066            protacc_column = "protAcc_with_ver"
 8067            mrnaacc_column = "mrnaAcc_with_ver"
 8068            refseqlink_query = f"""
 8069                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 8070                FROM {refseqlink_table} 
 8071                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 8072                WHERE protAcc_without_ver IS NOT NULL
 8073            """
 8074            # Polars Dataframe
 8075            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 8076
 8077        # Read RefSeq transcripts into a python dict/model.
 8078        log.debug(f"Transcripts loading...")
 8079        with tempfile.TemporaryDirectory() as tmpdir:
 8080            transcripts_query = f"""
 8081                COPY (
 8082                    SELECT {refseq_table}.*
 8083                    FROM {refseq_table}
 8084                    JOIN df_variants ON (
 8085                        {refseq_table}.chrom=df_variants.CHROM
 8086                        AND {refseq_table}.txStart<=df_variants.POS
 8087                        AND {refseq_table}.txEnd>=df_variants.POS
 8088                    )
 8089                )
 8090                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 8091            """
 8092            self.conn.query(transcripts_query)
 8093            with open(f"{tmpdir}/transcript.tsv") as infile:
 8094                transcripts = read_transcripts(infile)
 8095
 8096        # Polars connexion
 8097        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 8098
 8099        log.debug("Genome loading...")
 8100        # Read genome sequence using pyfaidx.
 8101        genome = Fasta(genome_file)
 8102
 8103        log.debug("Start annotation HGVS...")
 8104
 8105        # Create
 8106        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 8107        ddf = dd.from_pandas(df_variants, npartitions=threads)
 8108
 8109        # Use dask.dataframe.apply() to apply function on each partition
 8110        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 8111
 8112        # Convert Dask DataFrame to Pandas Dataframe
 8113        df = ddf.compute()
 8114
 8115        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 8116        with tempfile.TemporaryDirectory() as tmpdir:
 8117            df_parquet = os.path.join(tmpdir, "df.parquet")
 8118            df.to_parquet(df_parquet)
 8119
 8120            # Update hgvs column
 8121            update_variant_query = f"""
 8122                UPDATE {table_variants}
 8123                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 8124                FROM read_parquet('{df_parquet}') as df
 8125                WHERE variants."#CHROM" = df.CHROM
 8126                AND variants.POS = df.POS
 8127                AND variants.REF = df.REF
 8128                AND variants.ALT = df.ALT
 8129                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 8130                """
 8131            self.execute_query(update_variant_query)
 8132
 8133        # Update INFO column
 8134        sql_query_update = f"""
 8135            UPDATE {table_variants}
 8136            SET INFO = 
 8137                concat(
 8138                    CASE 
 8139                        WHEN INFO NOT IN ('','.')
 8140                        THEN concat(INFO, ';')
 8141                        ELSE ''
 8142                    END,
 8143                    'hgvs=',
 8144                    {hgvs_column_name}
 8145                )
 8146            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 8147            """
 8148        self.execute_query(sql_query_update)
 8149
 8150        # Add header
 8151        HGVS_INFOS = {
 8152            "hgvs": {
 8153                "ID": "hgvs",
 8154                "Number": ".",
 8155                "Type": "String",
 8156                "Description": f"HGVS annotatation with HOWARD",
 8157            }
 8158        }
 8159
 8160        for field in HGVS_INFOS:
 8161            field_ID = HGVS_INFOS[field]["ID"]
 8162            field_description = HGVS_INFOS[field]["Description"]
 8163            self.get_header().infos[field_ID] = vcf.parser._Info(
 8164                field_ID,
 8165                HGVS_INFOS[field]["Number"],
 8166                HGVS_INFOS[field]["Type"],
 8167                field_description,
 8168                "unknown",
 8169                "unknown",
 8170                code_type_map[HGVS_INFOS[field]["Type"]],
 8171            )
 8172
 8173        # Remove added columns
 8174        for added_column in added_columns:
 8175            self.drop_column(column=added_column)
 8176
 8177    ###
 8178    # Calculation
 8179    ###
 8180
 8181    def get_operations_help(
 8182        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8183    ) -> list:
 8184
 8185        # Init
 8186        operations_help = []
 8187
 8188        # operations
 8189        operations = self.get_config_json(
 8190            name="calculations",
 8191            config_dict=operations_config_dict,
 8192            config_file=operations_config_file,
 8193        )
 8194        for op in operations:
 8195            op_name = operations[op].get("name", op).upper()
 8196            op_description = operations[op].get("description", op_name)
 8197            op_available = operations[op].get("available", False)
 8198            if op_available:
 8199                operations_help.append(f"   {op_name}: {op_description}")
 8200
 8201        # Sort operations
 8202        operations_help.sort()
 8203
 8204        # insert header
 8205        operations_help.insert(0, "Available calculation operations:")
 8206
 8207        # Return
 8208        return operations_help
 8209
 8210    def calculation(
 8211        self,
 8212        operations: dict = {},
 8213        operations_config_dict: dict = {},
 8214        operations_config_file: str = None,
 8215    ) -> None:
 8216        """
 8217        It takes a list of operations, and for each operation, it checks if it's a python or sql
 8218        operation, and then calls the appropriate function
 8219
 8220        param json example:
 8221            "calculation": {
 8222                "NOMEN": {
 8223                    "options": {
 8224                        "hgvs_field": "hgvs"
 8225                    },
 8226                "middle" : null
 8227            }
 8228        """
 8229
 8230        # Param
 8231        param = self.get_param()
 8232
 8233        # operations config
 8234        operations_config = self.get_config_json(
 8235            name="calculations",
 8236            config_dict=operations_config_dict,
 8237            config_file=operations_config_file,
 8238        )
 8239
 8240        # Upper keys
 8241        operations_config = {k.upper(): v for k, v in operations_config.items()}
 8242
 8243        # Calculations
 8244
 8245        # Operations from param
 8246        operations = param.get("calculation", {}).get("calculations", operations)
 8247
 8248        # Quick calculation - add
 8249        if param.get("calculations", None):
 8250
 8251            # List of operations
 8252            calculations_list = [
 8253                value.strip() for value in param.get("calculations", "").split(",")
 8254            ]
 8255
 8256            # Log
 8257            log.info(f"Quick Calculations:")
 8258            for calculation_key in calculations_list:
 8259                log.info(f"   {calculation_key}")
 8260
 8261            # Create tmp operations (to keep operation order)
 8262            operations_tmp = {}
 8263            for calculation_operation in calculations_list:
 8264                if calculation_operation.upper() not in operations_tmp:
 8265                    log.debug(
 8266                        f"{calculation_operation}.upper() not in {operations_tmp}"
 8267                    )
 8268                    operations_tmp[calculation_operation.upper()] = {}
 8269                    add_value_into_dict(
 8270                        dict_tree=operations_tmp,
 8271                        sections=[
 8272                            calculation_operation.upper(),
 8273                        ],
 8274                        value=operations.get(calculation_operation.upper(), {}),
 8275                    )
 8276            # Add operations already in param
 8277            for calculation_operation in operations:
 8278                if calculation_operation not in operations_tmp:
 8279                    operations_tmp[calculation_operation] = operations.get(
 8280                        calculation_operation, {}
 8281                    )
 8282
 8283            # Update operations in param
 8284            operations = operations_tmp
 8285
 8286        # Operations for calculation
 8287        if not operations:
 8288            operations = param.get("calculation", {}).get("calculations", {})
 8289
 8290        if operations:
 8291            log.info(f"Calculations...")
 8292
 8293        # For each operations
 8294        for operation_name in operations:
 8295            operation_name = operation_name.upper()
 8296            if operation_name not in [""]:
 8297                if operation_name in operations_config:
 8298                    log.info(f"Calculation '{operation_name}'")
 8299                    operation = operations_config[operation_name]
 8300                    operation_type = operation.get("type", "sql")
 8301                    if operation_type == "python":
 8302                        self.calculation_process_function(
 8303                            operation=operation, operation_name=operation_name
 8304                        )
 8305                    elif operation_type == "sql":
 8306                        self.calculation_process_sql(
 8307                            operation=operation, operation_name=operation_name
 8308                        )
 8309                    else:
 8310                        log.error(
 8311                            f"Operations config: Type '{operation_type}' NOT available"
 8312                        )
 8313                        raise ValueError(
 8314                            f"Operations config: Type '{operation_type}' NOT available"
 8315                        )
 8316                else:
 8317                    log.error(
 8318                        f"Operations config: Calculation '{operation_name}' NOT available"
 8319                    )
 8320                    raise ValueError(
 8321                        f"Operations config: Calculation '{operation_name}' NOT available"
 8322                    )
 8323
 8324        # Explode INFOS fields into table fields
 8325        if self.get_explode_infos():
 8326            self.explode_infos(
 8327                prefix=self.get_explode_infos_prefix(),
 8328                fields=self.get_explode_infos_fields(),
 8329                force=True,
 8330            )
 8331
 8332    def calculation_process_sql(
 8333        self, operation: dict, operation_name: str = "unknown"
 8334    ) -> None:
 8335        """
 8336        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8337        performs the operation, updating the specified table with the result.
 8338
 8339        :param operation: The `operation` parameter is a dictionary that contains information about the
 8340        mathematical operation to be performed. It includes the following keys:
 8341        :type operation: dict
 8342        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8343        the mathematical operation being performed. It is used for logging and error handling purposes,
 8344        defaults to unknown
 8345        :type operation_name: str (optional)
 8346        """
 8347
 8348        # Operation infos
 8349        operation_name = operation.get("name", "unknown")
 8350        log.debug(f"process sql {operation_name}")
 8351        output_column_name = operation.get("output_column_name", operation_name)
 8352        output_column_type = operation.get("output_column_type", "String")
 8353        prefix = operation.get("explode_infos_prefix", "")
 8354        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8355        output_column_description = operation.get(
 8356            "output_column_description", f"{operation_name} operation"
 8357        )
 8358        operation_query = operation.get("operation_query", None)
 8359        if isinstance(operation_query, list):
 8360            operation_query = " ".join(operation_query)
 8361        operation_info_fields = operation.get("info_fields", [])
 8362        operation_info_fields_check = operation.get("info_fields_check", False)
 8363        operation_info = operation.get("operation_info", True)
 8364        operation_table = operation.get(
 8365            "table", self.get_table_variants(clause="alter")
 8366        )
 8367
 8368        # table variants
 8369        if operation_table:
 8370            table_variants = operation_table
 8371        else:
 8372            table_variants = self.get_table_variants(clause="alter")
 8373
 8374        if operation_query:
 8375
 8376            # Info fields check
 8377            operation_info_fields_check_result = True
 8378            if operation_info_fields_check:
 8379                header_infos = self.get_header().infos
 8380                for info_field in operation_info_fields:
 8381                    operation_info_fields_check_result = (
 8382                        operation_info_fields_check_result
 8383                        and info_field in header_infos
 8384                    )
 8385
 8386            # If info fields available
 8387            if operation_info_fields_check_result:
 8388
 8389                # Added_columns
 8390                added_columns = []
 8391
 8392                # Create VCF header field
 8393                vcf_reader = self.get_header()
 8394                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8395                    output_column_name,
 8396                    ".",
 8397                    output_column_type,
 8398                    output_column_description,
 8399                    "howard calculation",
 8400                    "0",
 8401                    self.code_type_map.get(output_column_type),
 8402                )
 8403
 8404                # Explode infos if needed
 8405                log.debug(f"calculation_process_sql prefix {prefix}")
 8406                added_columns += self.explode_infos(
 8407                    prefix=prefix,
 8408                    fields=[output_column_name] + operation_info_fields,
 8409                    force=False,
 8410                    table=table_variants,
 8411                )
 8412
 8413                # Create column
 8414                added_column = self.add_column(
 8415                    table_name=table_variants,
 8416                    column_name=prefix + output_column_name,
 8417                    column_type=output_column_type_sql,
 8418                    default_value="null",
 8419                )
 8420                added_columns.append(added_column)
 8421
 8422                # Operation calculation
 8423                try:
 8424
 8425                    # Query to update calculation column
 8426                    sql_update = f"""
 8427                        UPDATE {table_variants}
 8428                        SET "{prefix}{output_column_name}" = ({operation_query})
 8429                    """
 8430                    self.conn.execute(sql_update)
 8431
 8432                    # Add to INFO
 8433                    if operation_info:
 8434                        sql_update_info = f"""
 8435                            UPDATE {table_variants}
 8436                            SET "INFO" =
 8437                                concat(
 8438                                    CASE
 8439                                        WHEN "INFO" IS NOT NULL
 8440                                        THEN concat("INFO", ';')
 8441                                        ELSE ''
 8442                                    END,
 8443                                    '{output_column_name}=',
 8444                                    "{prefix}{output_column_name}"
 8445                                )
 8446                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8447                        """
 8448                        self.conn.execute(sql_update_info)
 8449
 8450                except:
 8451                    log.error(
 8452                        f"Operations config: Calculation '{operation_name}' query failed"
 8453                    )
 8454                    raise ValueError(
 8455                        f"Operations config: Calculation '{operation_name}' query failed"
 8456                    )
 8457
 8458                # Remove added columns
 8459                for added_column in added_columns:
 8460                    log.debug(f"added_column: {added_column}")
 8461                    self.drop_column(column=added_column)
 8462
 8463            else:
 8464                log.error(
 8465                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8466                )
 8467                raise ValueError(
 8468                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8469                )
 8470
 8471        else:
 8472            log.error(
 8473                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8474            )
 8475            raise ValueError(
 8476                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8477            )
 8478
 8479    def calculation_process_function(
 8480        self, operation: dict, operation_name: str = "unknown"
 8481    ) -> None:
 8482        """
 8483        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8484        function with the given parameters.
 8485
 8486        :param operation: The `operation` parameter is a dictionary that contains information about the
 8487        operation to be performed. It has the following keys:
 8488        :type operation: dict
 8489        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8490        the operation being performed. It is used for logging purposes, defaults to unknown
 8491        :type operation_name: str (optional)
 8492        """
 8493
 8494        operation_name = operation["name"]
 8495        log.debug(f"process sql {operation_name}")
 8496        function_name = operation["function_name"]
 8497        function_params = operation["function_params"]
 8498        getattr(self, function_name)(*function_params)
 8499
 8500    def calculation_variant_id(self) -> None:
 8501        """
 8502        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8503        updates the INFO field of a variants table with the variant ID.
 8504        """
 8505
 8506        # variant_id annotation field
 8507        variant_id_tag = self.get_variant_id_column()
 8508        added_columns = [variant_id_tag]
 8509
 8510        # variant_id hgvs tags"
 8511        vcf_infos_tags = {
 8512            variant_id_tag: "howard variant ID annotation",
 8513        }
 8514
 8515        # Variants table
 8516        table_variants = self.get_table_variants()
 8517
 8518        # Header
 8519        vcf_reader = self.get_header()
 8520
 8521        # Add variant_id to header
 8522        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8523            variant_id_tag,
 8524            ".",
 8525            "String",
 8526            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8527            "howard calculation",
 8528            "0",
 8529            self.code_type_map.get("String"),
 8530        )
 8531
 8532        # Update
 8533        sql_update = f"""
 8534            UPDATE {table_variants}
 8535            SET "INFO" = 
 8536                concat(
 8537                    CASE
 8538                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8539                        THEN ''
 8540                        ELSE concat("INFO", ';')
 8541                    END,
 8542                    '{variant_id_tag}=',
 8543                    "{variant_id_tag}"
 8544                )
 8545        """
 8546        self.conn.execute(sql_update)
 8547
 8548        # Remove added columns
 8549        for added_column in added_columns:
 8550            self.drop_column(column=added_column)
 8551
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
        function is used to specify the name of the column that will store the HGVS nomenclatures
        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
        snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
        function represents the field in the VCF file that contains SnpEff annotations. This field is
        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
        to ANN
        :type snpeff_field: str (optional)
        """

        # Description of the INFO tag added to the VCF header
        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any non-empty prefix is replaced by "INFO/" — confirm
        # this override of the configured prefix is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Prefixed column names for the exploded ANN field and the output field
        # snpEff fields
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns created here; all of them are dropped at the end
        # Add columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: the snpEff INFO description quotes the list of
            # pipe-separated sub-field names (e.g. 'Allele | Annotation | ...')
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters to build a clean key,
                    # mapped to the original sub-field label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (column used to join the DataFrame back to the table)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe with variant id and the exploded ANN column
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column: extract the HGVS nomenclature from each
            # ANN value using the sub-field labels parsed above
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO by joining on the variant id; the local DataFrame is
            # referenced by name in the SQL (DuckDB resolves it from the local
            # scope). Empty/'.'/'NaN' extraction results are skipped.
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory eagerly; these can be large)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8688
 8689    def calculation_snpeff_ann_explode(
 8690        self,
 8691        uniquify: bool = True,
 8692        output_format: str = "fields",
 8693        output_prefix: str = "snpeff_",
 8694        snpeff_field: str = "ANN",
 8695    ) -> None:
 8696        """
 8697        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8698        exploding the HGVS field and updating variant information accordingly.
 8699
 8700        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8701        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8702        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8703        defaults to True
 8704        :type uniquify: bool (optional)
 8705        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8706        function specifies the format in which the output annotations will be generated. It has a
 8707        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8708        format, defaults to fields
 8709        :type output_format: str (optional)
 8710        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8711        method is used to specify the prefix that will be added to the output annotations generated
 8712        during the calculation process. This prefix helps to differentiate the newly added annotations
 8713        from existing ones in the output data. By default, the, defaults to ANN_
 8714        :type output_prefix: str (optional)
 8715        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8716        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8717        field will be processed to explode the HGVS annotations and update the variant information
 8718        accordingly, defaults to ANN
 8719        :type snpeff_field: str (optional)
 8720        """
 8721
 8722        # SnpEff annotation field
 8723        snpeff_hgvs = "snpeff_ann_explode"
 8724
 8725        # Snpeff hgvs tags
 8726        vcf_infos_tags = {
 8727            snpeff_hgvs: "Explode snpEff annotations",
 8728        }
 8729
 8730        # Prefix
 8731        prefix = self.get_explode_infos_prefix()
 8732        if prefix:
 8733            prefix = "INFO/"
 8734
 8735        # snpEff fields
 8736        speff_ann_infos = prefix + snpeff_field
 8737        speff_hgvs_infos = prefix + snpeff_hgvs
 8738
 8739        # Variants table
 8740        table_variants = self.get_table_variants()
 8741
 8742        # Header
 8743        vcf_reader = self.get_header()
 8744
 8745        # Add columns
 8746        added_columns = []
 8747
 8748        # Explode HGVS field in column
 8749        added_columns += self.explode_infos(fields=[snpeff_field])
 8750        log.debug(f"snpeff_field={snpeff_field}")
 8751        log.debug(f"added_columns={added_columns}")
 8752
 8753        if snpeff_field in vcf_reader.infos:
 8754
 8755            # Extract ANN header
 8756            ann_description = vcf_reader.infos[snpeff_field].desc
 8757            pattern = r"'(.+?)'"
 8758            match = re.search(pattern, ann_description)
 8759            if match:
 8760                ann_header_match = match.group(1).split(" | ")
 8761                ann_header = []
 8762                ann_header_desc = {}
 8763                for i in range(len(ann_header_match)):
 8764                    ann_header_info = "".join(
 8765                        char for char in ann_header_match[i] if char.isalnum()
 8766                    )
 8767                    ann_header.append(ann_header_info)
 8768                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8769                if not ann_header_desc:
 8770                    raise ValueError("Invalid header description format")
 8771            else:
 8772                raise ValueError("Invalid header description format")
 8773
 8774            # Create variant id
 8775            variant_id_column = self.get_variant_id_column()
 8776            added_columns += [variant_id_column]
 8777
 8778            # Create dataframe
 8779            dataframe_snpeff_hgvs = self.get_query_to_df(
 8780                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8781            )
 8782
 8783            # Create snpEff columns
 8784            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8785                speff_ann_infos
 8786            ].apply(
 8787                lambda x: explode_snpeff_ann(
 8788                    str(x),
 8789                    uniquify=uniquify,
 8790                    output_format=output_format,
 8791                    prefix=output_prefix,
 8792                    header=list(ann_header_desc.values()),
 8793                )
 8794            )
 8795
 8796            # Header
 8797            ann_annotations_prefix = ""
 8798            if output_format.upper() in ["JSON"]:
 8799                ann_annotations_prefix = f"{output_prefix}="
 8800                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8801                    output_prefix,
 8802                    ".",
 8803                    "String",
 8804                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8805                    + " - JSON format",
 8806                    "howard calculation",
 8807                    "0",
 8808                    self.code_type_map.get("String"),
 8809                )
 8810            else:
 8811                for ann_annotation in ann_header:
 8812                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8813                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8814                        ann_annotation_id,
 8815                        ".",
 8816                        "String",
 8817                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8818                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8819                        "howard calculation",
 8820                        "0",
 8821                        self.code_type_map.get("String"),
 8822                    )
 8823
 8824            # Update
 8825            sql_update = f"""
 8826                UPDATE variants
 8827                SET "INFO" = 
 8828                    concat(
 8829                        CASE
 8830                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8831                            THEN ''
 8832                            ELSE concat("INFO", ';')
 8833                        END,
 8834                        CASE 
 8835                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8836                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8837                            THEN concat(
 8838                                '{ann_annotations_prefix}',
 8839                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8840                                )
 8841                            ELSE ''
 8842                        END
 8843                    )
 8844                FROM dataframe_snpeff_hgvs
 8845                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8846
 8847            """
 8848            self.conn.execute(sql_update)
 8849
 8850            # Delete dataframe
 8851            del dataframe_snpeff_hgvs
 8852            gc.collect()
 8853
 8854        else:
 8855
 8856            log.warning(
 8857                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8858            )
 8859
 8860        # Remove added columns
 8861        for added_column in added_columns:
 8862            self.drop_column(column=added_column)
 8863
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        Options are read from param["calculation"]["calculations"]["NOMEN"]["options"]:
        'hgvs_field' (INFO field containing HGVS expressions, default "hgvs"),
        'pattern', 'transcripts' (file of preferred transcripts),
        'transcripts_table', 'transcripts_column' and 'transcripts_order'.
        For each variant, the HGVS field is exploded into a column, `find_nomen`
        splits it into the NOMEN sub-fields (NOMEN, CNOMEN, PNOMEN, ...), and the
        variants table INFO column is updated with each non-empty sub-field.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Temporary dataframe column holding the dict returned by find_nomen
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: sub-field name -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Get HGVS field (INFO field to read HGVS expressions from)
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern (passed through to find_nomen)
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources, keyed by source name ("file", ...)
        transcripts_sources = {}

        # Get transcripts (optional file of preferred transcripts)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # First column of the file holds the transcript ids
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table (defaults to the variants table itself)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            # SQL expression selecting the per-variant preferred transcript
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            # Resolves to a SQL NULL literal when no transcript column is configured
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        # NOTE(review): only the "file" source is passed to find_nomen as a list;
        # the "column" source goes through extra_field_transcript — confirm
        # ordering semantics against find_nomen's transcripts_source_order
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe with variant keys, HGVS value and preferred transcript
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Create main NOMEN column: one dict of NOMEN sub-fields per variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                ),
                axis=1,
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                # (lambda is applied immediately, so the loop variable binding is safe)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # SQL fragment appending ';FIELD=value' when the field is non-empty
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO, joining on the full variant key
            # NOTE(review): unlike other calculations, INFO values '' or '.' are
            # kept as-is here (only NULL is replaced) — confirm intended
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 9063    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9064        """
 9065        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9066        pipeline/sample for a variant and updates the variant information in a VCF file.
 9067
 9068        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9069        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9070        VCF header and to update the corresponding field in the variants table, defaults to
 9071        findbypipeline
 9072        :type tag: str (optional)
 9073        """
 9074
 9075        # if FORMAT and samples
 9076        if (
 9077            "FORMAT" in self.get_header_columns_as_list()
 9078            and self.get_header_sample_list()
 9079        ):
 9080
 9081            # findbypipeline annotation field
 9082            findbypipeline_tag = tag
 9083
 9084            # VCF infos tags
 9085            vcf_infos_tags = {
 9086                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9087            }
 9088
 9089            # Prefix
 9090            prefix = self.get_explode_infos_prefix()
 9091
 9092            # Field
 9093            findbypipeline_infos = prefix + findbypipeline_tag
 9094
 9095            # Variants table
 9096            table_variants = self.get_table_variants()
 9097
 9098            # Header
 9099            vcf_reader = self.get_header()
 9100
 9101            # Create variant id
 9102            variant_id_column = self.get_variant_id_column()
 9103            added_columns = [variant_id_column]
 9104
 9105            # variant_id, FORMAT and samples
 9106            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9107                self.get_header_sample_list()
 9108            )
 9109
 9110            # Create dataframe
 9111            dataframe_findbypipeline = self.get_query_to_df(
 9112                f""" SELECT {samples_fields} FROM {table_variants} """
 9113            )
 9114
 9115            # Create findbypipeline column
 9116            dataframe_findbypipeline[findbypipeline_infos] = (
 9117                dataframe_findbypipeline.apply(
 9118                    lambda row: findbypipeline(
 9119                        row, samples=self.get_header_sample_list()
 9120                    ),
 9121                    axis=1,
 9122                )
 9123            )
 9124
 9125            # Add snpeff_hgvs to header
 9126            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9127                findbypipeline_tag,
 9128                ".",
 9129                "String",
 9130                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9131                "howard calculation",
 9132                "0",
 9133                self.code_type_map.get("String"),
 9134            )
 9135
 9136            # Update
 9137            sql_update = f"""
 9138                UPDATE variants
 9139                SET "INFO" = 
 9140                    concat(
 9141                        CASE
 9142                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9143                            THEN ''
 9144                            ELSE concat("INFO", ';')
 9145                        END,
 9146                        CASE 
 9147                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9148                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9149                            THEN concat(
 9150                                    '{findbypipeline_tag}=',
 9151                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9152                                )
 9153                            ELSE ''
 9154                        END
 9155                    )
 9156                FROM dataframe_findbypipeline
 9157                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9158            """
 9159            self.conn.execute(sql_update)
 9160
 9161            # Remove added columns
 9162            for added_column in added_columns:
 9163                self.drop_column(column=added_column)
 9164
 9165            # Delete dataframe
 9166            del dataframe_findbypipeline
 9167            gc.collect()
 9168
 9169    def calculation_genotype_concordance(self) -> None:
 9170        """
 9171        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9172        multi-caller VCF files and updates the variant information in the database.
 9173        """
 9174
 9175        # if FORMAT and samples
 9176        if (
 9177            "FORMAT" in self.get_header_columns_as_list()
 9178            and self.get_header_sample_list()
 9179        ):
 9180
 9181            # genotypeconcordance annotation field
 9182            genotypeconcordance_tag = "genotypeconcordance"
 9183
 9184            # VCF infos tags
 9185            vcf_infos_tags = {
 9186                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9187            }
 9188
 9189            # Prefix
 9190            prefix = self.get_explode_infos_prefix()
 9191
 9192            # Field
 9193            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9194
 9195            # Variants table
 9196            table_variants = self.get_table_variants()
 9197
 9198            # Header
 9199            vcf_reader = self.get_header()
 9200
 9201            # Create variant id
 9202            variant_id_column = self.get_variant_id_column()
 9203            added_columns = [variant_id_column]
 9204
 9205            # variant_id, FORMAT and samples
 9206            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9207                self.get_header_sample_list()
 9208            )
 9209
 9210            # Create dataframe
 9211            dataframe_genotypeconcordance = self.get_query_to_df(
 9212                f""" SELECT {samples_fields} FROM {table_variants} """
 9213            )
 9214
 9215            # Create genotypeconcordance column
 9216            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9217                dataframe_genotypeconcordance.apply(
 9218                    lambda row: genotypeconcordance(
 9219                        row, samples=self.get_header_sample_list()
 9220                    ),
 9221                    axis=1,
 9222                )
 9223            )
 9224
 9225            # Add genotypeconcordance to header
 9226            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9227                genotypeconcordance_tag,
 9228                ".",
 9229                "String",
 9230                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9231                "howard calculation",
 9232                "0",
 9233                self.code_type_map.get("String"),
 9234            )
 9235
 9236            # Update
 9237            sql_update = f"""
 9238                UPDATE variants
 9239                SET "INFO" = 
 9240                    concat(
 9241                        CASE
 9242                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9243                            THEN ''
 9244                            ELSE concat("INFO", ';')
 9245                        END,
 9246                        CASE
 9247                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9248                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9249                            THEN concat(
 9250                                    '{genotypeconcordance_tag}=',
 9251                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9252                                )
 9253                            ELSE ''
 9254                        END
 9255                    )
 9256                FROM dataframe_genotypeconcordance
 9257                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9258            """
 9259            self.conn.execute(sql_update)
 9260
 9261            # Remove added columns
 9262            for added_column in added_columns:
 9263                self.drop_column(column=added_column)
 9264
 9265            # Delete dataframe
 9266            del dataframe_genotypeconcordance
 9267            gc.collect()
 9268
 9269    def calculation_barcode(self, tag: str = "barcode") -> None:
 9270        """
 9271        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9272        updates the INFO field in the file with the calculated barcode values.
 9273
 9274        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9275        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9276        the default tag name is set to "barcode", defaults to barcode
 9277        :type tag: str (optional)
 9278        """
 9279
 9280        # if FORMAT and samples
 9281        if (
 9282            "FORMAT" in self.get_header_columns_as_list()
 9283            and self.get_header_sample_list()
 9284        ):
 9285
 9286            # barcode annotation field
 9287            if not tag:
 9288                tag = "barcode"
 9289
 9290            # VCF infos tags
 9291            vcf_infos_tags = {
 9292                tag: "barcode calculation (VaRank)",
 9293            }
 9294
 9295            # Prefix
 9296            prefix = self.get_explode_infos_prefix()
 9297
 9298            # Field
 9299            barcode_infos = prefix + tag
 9300
 9301            # Variants table
 9302            table_variants = self.get_table_variants()
 9303
 9304            # Header
 9305            vcf_reader = self.get_header()
 9306
 9307            # Create variant id
 9308            variant_id_column = self.get_variant_id_column()
 9309            added_columns = [variant_id_column]
 9310
 9311            # variant_id, FORMAT and samples
 9312            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9313                self.get_header_sample_list()
 9314            )
 9315
 9316            # Create dataframe
 9317            dataframe_barcode = self.get_query_to_df(
 9318                f""" SELECT {samples_fields} FROM {table_variants} """
 9319            )
 9320
 9321            # Create barcode column
 9322            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9323                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9324            )
 9325
 9326            # Add barcode to header
 9327            vcf_reader.infos[tag] = vcf.parser._Info(
 9328                tag,
 9329                ".",
 9330                "String",
 9331                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9332                "howard calculation",
 9333                "0",
 9334                self.code_type_map.get("String"),
 9335            )
 9336
 9337            # Update
 9338            sql_update = f"""
 9339                UPDATE {table_variants}
 9340                SET "INFO" = 
 9341                    concat(
 9342                        CASE
 9343                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9344                            THEN ''
 9345                            ELSE concat("INFO", ';')
 9346                        END,
 9347                        CASE
 9348                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9349                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9350                            THEN concat(
 9351                                    '{tag}=',
 9352                                    dataframe_barcode."{barcode_infos}"
 9353                                )
 9354                            ELSE ''
 9355                        END
 9356                    )
 9357                FROM dataframe_barcode
 9358                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9359            """
 9360            self.conn.execute(sql_update)
 9361
 9362            # Remove added columns
 9363            for added_column in added_columns:
 9364                self.drop_column(column=added_column)
 9365
 9366            # Delete dataframe
 9367            del dataframe_barcode
 9368            gc.collect()
 9369
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        The family pedigree is read from
        param["calculation"]["calculations"]["BARCODEFAMILY"]["family_pedigree"]
        and may be a JSON file path, a JSON string, a comma-separated list of
        sample names, or a dict; when absent, all samples of the VCF are used.
        The computed barcode is appended to the FORMAT and sample columns as two
        new fields: '<tag>' (the family barcode) and '<tag>S' (the samples used).

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is malformed or resolves to no sample
        """

        # Genotype data is required: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against tag=None or "")
            if not tag:
                tag = "BCF"

            # VCF infos tags: descriptions for the two FORMAT fields added
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param (pedigree: file path, JSON string, sample list or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file (JSON mapping member -> sample)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, fall back to
                # comma-separated sample names (mapped to themselves)
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict: used as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of samples from the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree: every sample of the VCF, mapped to itself
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree is non-empty
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the dataframe column that will hold the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added here, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and the pedigree samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode row by row over the pedigree samples
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the two new FORMAT fields ('<tag>' and '<tag>S') in the header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET fragment per column: FORMAT gets the field names,
            # pedigree samples get the barcode and sample list, others get '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For './.' genotypes: strip alphanumerics/spaces from FORMAT to
                # keep only the ':' separators, then turn each into ':.' so the
                # missing fields are padded with '.' before appending new values
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
 9560    def calculation_trio(self) -> None:
 9561        """
 9562        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9563        information to the INFO field of each variant.
 9564        """
 9565
 9566        # if FORMAT and samples
 9567        if (
 9568            "FORMAT" in self.get_header_columns_as_list()
 9569            and self.get_header_sample_list()
 9570        ):
 9571
 9572            # trio annotation field
 9573            trio_tag = "trio"
 9574
 9575            # VCF infos tags
 9576            vcf_infos_tags = {
 9577                "trio": "trio calculation",
 9578            }
 9579
 9580            # Param
 9581            param = self.get_param()
 9582
 9583            # Prefix
 9584            prefix = self.get_explode_infos_prefix()
 9585
 9586            # Trio param
 9587            trio_ped = (
 9588                param.get("calculation", {})
 9589                .get("calculations", {})
 9590                .get("TRIO", {})
 9591                .get("trio_pedigree", None)
 9592            )
 9593
 9594            # Load trio
 9595            if trio_ped:
 9596
 9597                # Trio pedigree is a file
 9598                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9599                    log.debug("TRIO pedigree is file")
 9600                    with open(full_path(trio_ped)) as trio_ped:
 9601                        trio_ped = json.load(trio_ped)
 9602
 9603                # Trio pedigree is a string
 9604                elif isinstance(trio_ped, str):
 9605                    log.debug("TRIO pedigree is str")
 9606                    try:
 9607                        trio_ped = json.loads(trio_ped)
 9608                        log.debug("TRIO pedigree is json str")
 9609                    except ValueError as e:
 9610                        trio_samples = trio_ped.split(",")
 9611                        if len(trio_samples) == 3:
 9612                            trio_ped = {
 9613                                "father": trio_samples[0],
 9614                                "mother": trio_samples[1],
 9615                                "child": trio_samples[2],
 9616                            }
 9617                            log.debug("TRIO pedigree is list str")
 9618                        else:
 9619                            msg_error = "TRIO pedigree not well formatted"
 9620                            log.error(msg_error)
 9621                            raise ValueError(msg_error)
 9622
 9623                # Trio pedigree is a dict
 9624                elif isinstance(trio_ped, dict):
 9625                    log.debug("TRIO pedigree is dict")
 9626
 9627                # Trio pedigree is not well formatted
 9628                else:
 9629                    msg_error = "TRIO pedigree not well formatted"
 9630                    log.error(msg_error)
 9631                    raise ValueError(msg_error)
 9632
 9633                # Construct trio list
 9634                trio_samples = [
 9635                    trio_ped.get("father", ""),
 9636                    trio_ped.get("mother", ""),
 9637                    trio_ped.get("child", ""),
 9638                ]
 9639
 9640            else:
 9641                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9642                samples_list = self.get_header_sample_list()
 9643                if len(samples_list) >= 3:
 9644                    trio_samples = self.get_header_sample_list()[0:3]
 9645                    trio_ped = {
 9646                        "father": trio_samples[0],
 9647                        "mother": trio_samples[1],
 9648                        "child": trio_samples[2],
 9649                    }
 9650                else:
 9651                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9652                    log.error(msg_error)
 9653                    raise ValueError(msg_error)
 9654
 9655            # Check trio pedigree
 9656            if not trio_ped or len(trio_ped) != 3:
 9657                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9658                log.error(msg_error)
 9659                raise ValueError(msg_error)
 9660
 9661            # Log
 9662            log.info(
 9663                f"Calculation 'TRIO' - Samples: "
 9664                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9665            )
 9666
 9667            # Field
 9668            trio_infos = prefix + trio_tag
 9669
 9670            # Variants table
 9671            table_variants = self.get_table_variants()
 9672
 9673            # Header
 9674            vcf_reader = self.get_header()
 9675
 9676            # Create variant id
 9677            variant_id_column = self.get_variant_id_column()
 9678            added_columns = [variant_id_column]
 9679
 9680            # variant_id, FORMAT and samples
 9681            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9682                self.get_header_sample_list()
 9683            )
 9684
 9685            # Create dataframe
 9686            dataframe_trio = self.get_query_to_df(
 9687                f""" SELECT {samples_fields} FROM {table_variants} """
 9688            )
 9689
 9690            # Create trio column
 9691            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9692                lambda row: trio(row, samples=trio_samples), axis=1
 9693            )
 9694
 9695            # Add trio to header
 9696            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9697                trio_tag,
 9698                ".",
 9699                "String",
 9700                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9701                "howard calculation",
 9702                "0",
 9703                self.code_type_map.get("String"),
 9704            )
 9705
 9706            # Update
 9707            sql_update = f"""
 9708                UPDATE {table_variants}
 9709                SET "INFO" = 
 9710                    concat(
 9711                        CASE
 9712                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9713                            THEN ''
 9714                            ELSE concat("INFO", ';')
 9715                        END,
 9716                        CASE
 9717                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9718                             AND dataframe_trio."{trio_infos}" NOT NULL
 9719                            THEN concat(
 9720                                    '{trio_tag}=',
 9721                                    dataframe_trio."{trio_infos}"
 9722                                )
 9723                            ELSE ''
 9724                        END
 9725                    )
 9726                FROM dataframe_trio
 9727                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9728            """
 9729            self.conn.execute(sql_update)
 9730
 9731            # Remove added columns
 9732            for added_column in added_columns:
 9733                self.drop_column(column=added_column)
 9734
 9735            # Delete dataframe
 9736            del dataframe_trio
 9737            gc.collect()
 9738
 9739    def calculation_vaf_normalization(self) -> None:
 9740        """
 9741        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9742        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9743        :return: The function does not return anything.
 9744        """
 9745
 9746        # if FORMAT and samples
 9747        if (
 9748            "FORMAT" in self.get_header_columns_as_list()
 9749            and self.get_header_sample_list()
 9750        ):
 9751
 9752            # vaf_normalization annotation field
 9753            vaf_normalization_tag = "VAF"
 9754
 9755            # VCF infos tags
 9756            vcf_infos_tags = {
 9757                "VAF": "VAF Variant Frequency",
 9758            }
 9759
 9760            # Prefix
 9761            prefix = self.get_explode_infos_prefix()
 9762
 9763            # Variants table
 9764            table_variants = self.get_table_variants()
 9765
 9766            # Header
 9767            vcf_reader = self.get_header()
 9768
 9769            # Do not calculate if VAF already exists
 9770            if "VAF" in vcf_reader.formats:
 9771                log.debug("VAF already on genotypes")
 9772                return
 9773
 9774            # Create variant id
 9775            variant_id_column = self.get_variant_id_column()
 9776            added_columns = [variant_id_column]
 9777
 9778            # variant_id, FORMAT and samples
 9779            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9780                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9781            )
 9782
 9783            # Create dataframe
 9784            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9785            log.debug(f"query={query}")
 9786            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9787
 9788            vaf_normalization_set = []
 9789
 9790            # for each sample vaf_normalization
 9791            for sample in self.get_header_sample_list():
 9792                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9793                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9794                )
 9795                vaf_normalization_set.append(
 9796                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9797                )
 9798
 9799            # Add VAF to FORMAT
 9800            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9801                "FORMAT"
 9802            ].apply(lambda x: str(x) + ":VAF")
 9803            vaf_normalization_set.append(
 9804                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9805            )
 9806
 9807            # Add vaf_normalization to header
 9808            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9809                id=vaf_normalization_tag,
 9810                num="1",
 9811                type="Float",
 9812                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9813                type_code=self.code_type_map.get("Float"),
 9814            )
 9815
 9816            # Create fields to add in INFO
 9817            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9818
 9819            # Update
 9820            sql_update = f"""
 9821                UPDATE {table_variants}
 9822                SET {sql_vaf_normalization_set}
 9823                FROM dataframe_vaf_normalization
 9824                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9825
 9826            """
 9827            self.conn.execute(sql_update)
 9828
 9829            # Remove added columns
 9830            for added_column in added_columns:
 9831                self.drop_column(column=added_column)
 9832
 9833            # Delete dataframe
 9834            del dataframe_vaf_normalization
 9835            gc.collect()
 9836
 9837    def calculation_genotype_stats(self, info: str = "VAF") -> None:
 9838        """
 9839        The `calculation_genotype_stats` function calculates genotype statistics for a given information
 9840        field in a VCF file and updates the INFO column of the variants table with the calculated
 9841        statistics.
 9842
 9843        :param info: The `info` parameter is a string that represents the type of information for which
 9844        genotype statistics are calculated. It is used to generate various VCF info tags for the
 9845        statistics, such as the number of occurrences, the list of values, the minimum value, the
 9846        maximum value, the mean, the median, defaults to VAF
 9847        :type info: str (optional)
 9848        """
 9849
 9850        # if FORMAT and samples
 9851        if (
 9852            "FORMAT" in self.get_header_columns_as_list()
 9853            and self.get_header_sample_list()
 9854        ):
 9855
 9856            # vaf_stats annotation field
 9857            vaf_stats_tag = info + "_stats"
 9858
 9859            # VCF infos tags
 9860            vcf_infos_tags = {
 9861                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
 9862                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
 9863                info + "_stats_min": f"genotype {info} Statistics - min {info}",
 9864                info + "_stats_max": f"genotype {info} Statistics - max {info}",
 9865                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
 9866                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
 9867                info
 9868                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
 9869            }
 9870
 9871            # Prefix
 9872            prefix = self.get_explode_infos_prefix()
 9873
 9874            # Field
 9875            vaf_stats_infos = prefix + vaf_stats_tag
 9876
 9877            # Variants table
 9878            table_variants = self.get_table_variants()
 9879
 9880            # Header
 9881            vcf_reader = self.get_header()
 9882
 9883            # Create variant id
 9884            variant_id_column = self.get_variant_id_column()
 9885            added_columns = [variant_id_column]
 9886
 9887            # variant_id, FORMAT and samples
 9888            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9889                self.get_header_sample_list()
 9890            )
 9891
 9892            # Create dataframe
 9893            dataframe_vaf_stats = self.get_query_to_df(
 9894                f""" SELECT {samples_fields} FROM {table_variants} """
 9895            )
 9896
 9897            # Create vaf_stats column
 9898            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
 9899                lambda row: genotype_stats(
 9900                    row, samples=self.get_header_sample_list(), info=info
 9901                ),
 9902                axis=1,
 9903            )
 9904
 9905            # List of vcf tags
 9906            sql_vaf_stats_fields = []
 9907
 9908            # Check all VAF stats infos
 9909            for stat in vcf_infos_tags:
 9910
 9911                # Extract stats
 9912                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
 9913                    lambda x: dict(x).get(stat, "")
 9914                )
 9915
 9916                # Add snpeff_hgvs to header
 9917                vcf_reader.infos[stat] = vcf.parser._Info(
 9918                    stat,
 9919                    ".",
 9920                    "String",
 9921                    vcf_infos_tags.get(stat, "genotype statistics"),
 9922                    "howard calculation",
 9923                    "0",
 9924                    self.code_type_map.get("String"),
 9925                )
 9926
 9927                if len(sql_vaf_stats_fields):
 9928                    sep = ";"
 9929                else:
 9930                    sep = ""
 9931
 9932                # Create fields to add in INFO
 9933                sql_vaf_stats_fields.append(
 9934                    f"""
 9935                        CASE
 9936                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
 9937                            THEN concat(
 9938                                    '{sep}{stat}=',
 9939                                    dataframe_vaf_stats."{stat}"
 9940                                )
 9941                            ELSE ''
 9942                        END
 9943                    """
 9944                )
 9945
 9946            # SQL set for update
 9947            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
 9948
 9949            # Update
 9950            sql_update = f"""
 9951                UPDATE {table_variants}
 9952                SET "INFO" = 
 9953                    concat(
 9954                        CASE
 9955                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9956                            THEN ''
 9957                            ELSE concat("INFO", ';')
 9958                        END,
 9959                        {sql_vaf_stats_fields_set}
 9960                    )
 9961                FROM dataframe_vaf_stats
 9962                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
 9963
 9964            """
 9965            self.conn.execute(sql_update)
 9966
 9967            # Remove added columns
 9968            for added_column in added_columns:
 9969                self.drop_column(column=added_column)
 9970
 9971            # Delete dataframe
 9972            del dataframe_vaf_stats
 9973            gc.collect()
 9974
 9975    def calculation_transcripts_annotation(
 9976        self, info_json: str = None, info_format: str = None
 9977    ) -> None:
 9978        """
 9979        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
 9980        field to it if transcripts are available.
 9981
 9982        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
 9983        is a string parameter that represents the information field to be used in the transcripts JSON.
 9984        It is used to specify the JSON format for the transcripts information. If no value is provided
 9985        when calling the method, it defaults to "
 9986        :type info_json: str
 9987        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
 9988        method is a string parameter that specifies the format of the information field to be used in
 9989        the transcripts JSON. It is used to define the format of the information field
 9990        :type info_format: str
 9991        """
 9992
 9993        # Create transcripts table
 9994        transcripts_table = self.create_transcript_view()
 9995
 9996        # Add info field
 9997        if transcripts_table:
 9998            self.transcript_view_to_variants(
 9999                transcripts_table=transcripts_table,
10000                transcripts_info_field_json=info_json,
10001                transcripts_info_field_format=info_format,
10002            )
10003        else:
10004            log.info("No Transcripts to process. Check param.json file configuration")
10005
10006    def calculation_transcripts_prioritization(self) -> None:
10007        """
10008        The function `calculation_transcripts_prioritization` creates a transcripts table and
10009        prioritizes transcripts based on certain criteria.
10010        """
10011
10012        # Create transcripts table
10013        transcripts_table = self.create_transcript_view()
10014
10015        # Add info field
10016        if transcripts_table:
10017            self.transcripts_prioritization(transcripts_table=transcripts_table)
10018        else:
10019            log.info("No Transcripts to process. Check param.json file configuration")
10020
10021    def calculation_transcripts_export(self) -> None:
10022        """ """
10023
10024        # Create transcripts table
10025        transcripts_table = self.create_transcript_view()
10026
10027        # Add info field
10028        if transcripts_table:
10029            self.transcripts_export(transcripts_table=transcripts_table)
10030        else:
10031            log.info("No Transcripts to process. Check param.json file configuration")
10032
10033    ###############
10034    # Transcripts #
10035    ###############
10036
10037    def transcripts_export(
10038        self, transcripts_table: str = None, param: dict = {}
10039    ) -> bool:
10040        """ """
10041
10042        log.debug("Start transcripts export...")
10043
10044        # Param
10045        if not param:
10046            param = self.get_param()
10047
10048        # Param export
10049        param_transcript_export = param.get("transcripts", {}).get("export", {})
10050
10051        # Output file
10052        transcripts_export_output = param_transcript_export.get("output", None)
10053
10054        if not param_transcript_export or not transcripts_export_output:
10055            log.warning(f"No transcriipts export parameters defined!")
10056            return False
10057
10058        # List of transcripts annotations
10059        query_describe = f"""
10060            SELECT column_name
10061            FROM (
10062                    DESCRIBE SELECT * FROM {transcripts_table}
10063                )
10064            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10065        """
10066        transcripts_annotations_list = list(
10067            self.get_query_to_df(query=query_describe)["column_name"]
10068        )
10069
10070        # Create transcripts table for export
10071        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10072            random.choices(string.ascii_uppercase + string.digits, k=10)
10073        )
10074        query_create_transcripts_table_export = f"""
10075            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10076        """
10077        self.execute_query(query=query_create_transcripts_table_export)
10078
10079        # Output file format
10080        transcripts_export_output_format = get_file_format(
10081            filename=transcripts_export_output
10082        )
10083
10084        # Format VCF - construct INFO
10085        if transcripts_export_output_format in ["vcf"]:
10086
10087            # Construct query update INFO and header
10088            query_update_info = []
10089            for field in transcripts_annotations_list:
10090
10091                # If field not in header
10092                if field not in self.get_header_infos_list():
10093
10094                    # Add PZ Transcript in header
10095                    self.get_header().infos[field] = vcf.parser._Info(
10096                        field,
10097                        ".",
10098                        "String",
10099                        f"Annotation '{field}' from transcript view",
10100                        "unknown",
10101                        "unknown",
10102                        0,
10103                    )
10104
10105                # Add field as INFO/tag
10106                query_update_info.append(
10107                    f"""
10108                        CASE
10109                            WHEN "{field}" IS NOT NULL
10110                            THEN concat('{field}=', "{field}", ';')    
10111                            ELSE ''     
10112                        END
10113                        """
10114                )
10115
10116            # Query param
10117            query_update_info_value = (
10118                f""" concat('',  {", ".join(query_update_info)}) """
10119            )
10120            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10121
10122        else:
10123
10124            # Query param
10125            query_update_info_value = f""" NULL """
10126            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10127
10128        # Update query INFO column
10129        query_update = f"""
10130            UPDATE {transcripts_table_export}
10131            SET INFO = {query_update_info_value}
10132
10133        """
10134        self.execute_query(query=query_update)
10135
10136        # Export
10137        self.export_output(
10138            output_file=transcripts_export_output,
10139            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10140        )
10141
10142        # Drop transcripts export table
10143        query_drop_transcripts_table_export = f"""
10144            DROP TABLE {transcripts_table_export}
10145        """
10146        self.execute_query(query=query_drop_transcripts_table_export)
10147
10148    def transcripts_prioritization(
10149        self, transcripts_table: str = None, param: dict = {}
10150    ) -> bool:
10151        """
10152        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10153        and updates the variants table with the prioritized information.
10154
10155        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10156        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10157        This parameter is used to identify the table where the transcripts data is stored for the
10158        prioritization process
10159        :type transcripts_table: str
10160        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10161        that contains various configuration settings for the prioritization process of transcripts. It
10162        is used to customize the behavior of the prioritization algorithm and includes settings such as
10163        the prefix for prioritization fields, default profiles, and other
10164        :type param: dict
10165        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10166        transcripts prioritization process is successfully completed, and `False` if there are any
10167        issues or if no profile is defined for transcripts prioritization.
10168        """
10169
10170        log.debug("Start transcripts prioritization...")
10171
10172        # Param
10173        if not param:
10174            param = self.get_param()
10175
10176        # Variants table
10177        table_variants = self.get_table_variants()
10178
10179        # Transcripts table
10180        if transcripts_table is None:
10181            transcripts_table = self.create_transcript_view(
10182                transcripts_table="transcripts", param=param
10183            )
10184        if transcripts_table is None:
10185            msg_err = "No Transcripts table availalble"
10186            log.error(msg_err)
10187            raise ValueError(msg_err)
10188        log.debug(f"transcripts_table={transcripts_table}")
10189
10190        # Get transcripts columns
10191        columns_as_list_query = f"""
10192            DESCRIBE {transcripts_table}
10193        """
10194        columns_as_list = list(
10195            self.get_query_to_df(columns_as_list_query)["column_name"]
10196        )
10197
10198        # Create INFO if not exists
10199        if "INFO" not in columns_as_list:
10200            query_add_info = f"""
10201                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10202            """
10203            self.execute_query(query_add_info)
10204
10205        # Prioritization param and Force only PZ Score and Flag
10206        pz_param = param.get("transcripts", {}).get("prioritization", {})
10207
10208        # PZ profile by default
10209        pz_profile_default = (
10210            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10211        )
10212
10213        # Exit if no profile
10214        if pz_profile_default is None:
10215            log.warning("No profile defined for transcripts prioritization")
10216            return False
10217
10218        # PZ fields
10219        pz_param_pzfields = {}
10220
10221        # PZ field transcripts
10222        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10223
10224        # Add PZ Transcript in header
10225        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10226            pz_fields_transcripts,
10227            ".",
10228            "String",
10229            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10230            "unknown",
10231            "unknown",
10232            code_type_map["String"],
10233        )
10234
10235        # Mandatory fields
10236        pz_mandatory_fields_list = [
10237            "Score",
10238            "Flag",
10239            "Tags",
10240            "Comment",
10241            "Infos",
10242            "Class",
10243        ]
10244        pz_mandatory_fields = []
10245        for pz_mandatory_field in pz_mandatory_fields_list:
10246            pz_mandatory_fields.append(
10247                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10248            )
10249
10250        # PZ fields in param
10251        for pz_field in pz_param.get("pzfields", []):
10252            if pz_field in pz_mandatory_fields_list:
10253                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10254                    pz_param.get("pzprefix", "PTZ") + pz_field
10255                )
10256            else:
10257                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10258                pz_param_pzfields[pz_field] = pz_field_new
10259
10260                # Add PZ Transcript in header
10261                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10262                    pz_field_new,
10263                    ".",
10264                    "String",
10265                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10266                    "unknown",
10267                    "unknown",
10268                    code_type_map["String"],
10269                )
10270
10271        # PZ fields param
10272        pz_param["pzfields"] = pz_mandatory_fields
10273
10274        # Prioritization
10275        prioritization_result = self.prioritization(
10276            table=transcripts_table,
10277            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10278        )
10279        if not prioritization_result:
10280            log.warning("Transcripts prioritization not processed")
10281            return False
10282
10283        # PZ fields sql query
10284        query_update_select_list = []
10285        query_update_concat_list = []
10286        query_update_order_list = []
10287        for pz_param_pzfield in set(
10288            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10289        ):
10290            query_update_select_list.append(f" {pz_param_pzfield}, ")
10291
10292        for pz_param_pzfield in pz_param_pzfields:
10293            query_update_concat_list.append(
10294                f"""
10295                    , CASE 
10296                        WHEN {pz_param_pzfield} IS NOT NULL
10297                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10298                        ELSE ''
10299                    END
10300                """
10301            )
10302
10303        # Order by
10304        pz_orders = (
10305            param.get("transcripts", {})
10306            .get("prioritization", {})
10307            .get("prioritization_transcripts_order", {})
10308        )
10309        if not pz_orders:
10310            pz_orders = {
10311                pz_param.get("pzprefix", "PTZ") + "Flag": "ASC",
10312                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10313            }
10314        for pz_order in pz_orders:
10315            query_update_order_list.append(
10316                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10317            )
10318
10319        # Fields to explode
10320        fields_to_explode = (
10321            list(pz_param_pzfields.keys())
10322            + pz_mandatory_fields
10323            + list(pz_orders.keys())
10324        )
10325        # Remove transcript column as a specific transcript column
10326        if "transcript" in fields_to_explode:
10327            fields_to_explode.remove("transcript")
10328
10329        # Fields intranscripts table
10330        query_transcripts_table = f"""
10331            DESCRIBE SELECT * FROM {transcripts_table}
10332        """
10333        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10334
10335        # Check fields to explode
10336        for field_to_explode in fields_to_explode:
10337            if field_to_explode not in self.get_header_infos_list() + list(
10338                query_transcripts_table.column_name
10339            ):
10340                msg_err = f"INFO/{field_to_explode} NOT IN header"
10341                log.error(msg_err)
10342                raise ValueError(msg_err)
10343
10344        # Explode fields to explode
10345        self.explode_infos(
10346            table=transcripts_table,
10347            fields=fields_to_explode,
10348        )
10349
10350        # Transcript preference file
10351        transcripts_preference_file = (
10352            param.get("transcripts", {})
10353            .get("prioritization", {})
10354            .get("prioritization_transcripts", {})
10355        )
10356        transcripts_preference_file = full_path(transcripts_preference_file)
10357
10358        # Transcript preference forced
10359        transcript_preference_force = (
10360            param.get("transcripts", {})
10361            .get("prioritization", {})
10362            .get("prioritization_transcripts_force", False)
10363        )
10364        # Transcript version forced
10365        transcript_version_force = (
10366            param.get("transcripts", {})
10367            .get("prioritization", {})
10368            .get("prioritization_transcripts_version_force", False)
10369        )
10370
10371        # Transcripts Ranking
10372        if transcripts_preference_file:
10373
10374            # Transcripts file to dataframe
10375            if os.path.exists(transcripts_preference_file):
10376                transcripts_preference_dataframe = transcripts_file_to_df(
10377                    transcripts_preference_file
10378                )
10379            else:
10380                log.error(
10381                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10382                )
10383                raise ValueError(
10384                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10385                )
10386
10387            # Order by depending to transcript preference forcing
10388            if transcript_preference_force:
10389                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10390            else:
10391                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10392
10393            # Transcript columns joined depend on version consideration
10394            if transcript_version_force:
10395                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10396            else:
10397                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10398
10399            # Query ranking for update
10400            query_update_ranking = f"""
10401                SELECT
10402                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10403                    ROW_NUMBER() OVER (
10404                        PARTITION BY "#CHROM", POS, REF, ALT
10405                        ORDER BY {order_by}
10406                    ) AS rn
10407                FROM {transcripts_table}
10408                LEFT JOIN 
10409                    (
10410                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10411                        FROM transcripts_preference_dataframe
10412                    ) AS transcripts_preference
10413                ON {transcripts_version_join}
10414            """
10415
10416        else:
10417
10418            # Query ranking for update
10419            query_update_ranking = f"""
10420                SELECT
10421                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10422                    ROW_NUMBER() OVER (
10423                        PARTITION BY "#CHROM", POS, REF, ALT
10424                        ORDER BY {" , ".join(query_update_order_list)}
10425                    ) AS rn
10426                FROM {transcripts_table}
10427            """
10428
10429        # Export Transcripts prioritization infos to variants table
10430        query_update = f"""
10431            WITH RankedTranscripts AS (
10432                {query_update_ranking}
10433            )
10434            UPDATE {table_variants}
10435                SET
10436                INFO = CONCAT(CASE
10437                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10438                            THEN ''
10439                            ELSE concat("INFO", ';')
10440                        END,
10441                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10442                        )
10443            FROM
10444                RankedTranscripts
10445            WHERE
10446                rn = 1
10447                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10448                AND variants."POS" = RankedTranscripts."POS"
10449                AND variants."REF" = RankedTranscripts."REF"
10450                AND variants."ALT" = RankedTranscripts."ALT"     
10451        """
10452
10453        # log.debug(f"query_update={query_update}")
10454        self.execute_query(query=query_update)
10455
10456        # Return
10457        return True
10458
10459    def create_transcript_view_from_columns_map(
10460        self,
10461        transcripts_table: str = "transcripts",
10462        columns_maps: dict = {},
10463        added_columns: list = [],
10464        temporary_tables: list = None,
10465        annotation_fields: list = None,
10466        column_rename: dict = {},
10467        column_clean: bool = False,
10468        column_case: str = None,
10469    ) -> tuple[list, list, list]:
10470        """
10471        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10472        specified columns mapping for transcripts data.
10473
10474        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10475        of the table where the transcripts data is stored or will be stored in the database. This table
10476        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10477        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10478        :type transcripts_table: str (optional)
10479        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10480        about how to map columns from a transcripts table to create a view. Each entry in the
10481        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10482        typically includes details such as the main transcript column and additional information columns
10483        :type columns_maps: dict
10484        :param added_columns: The `added_columns` parameter in the
10485        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10486        that will be added to the view being created based on the columns map provided. These columns
10487        are generated by exploding the transcript information columns along with the main transcript
10488        column
10489        :type added_columns: list
10490        :param temporary_tables: The `temporary_tables` parameter in the
10491        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10492        tables created during the process of creating a transcript view from a columns map. These
10493        temporary tables are used to store intermediate results or transformations before the final view
10494        is generated
10495        :type temporary_tables: list
10496        :param annotation_fields: The `annotation_fields` parameter in the
10497        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10498        used for annotation in the query view creation process. These fields are extracted from the
10499        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10500        :type annotation_fields: list
10501        :param column_rename: The `column_rename` parameter in the
10502        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10503        custom renaming for columns during the creation of the temporary table view. This parameter
10504        provides a mapping of original column names to the desired renamed column names. By using this
10505        parameter,
10506        :type column_rename: dict
10507        :param column_clean: The `column_clean` parameter in the
10508        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10509        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10510        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10511        False
10512        :type column_clean: bool (optional)
10513        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10514        function is used to specify the case transformation to be applied to the columns during the view
10515        creation process. It allows you to control whether the column values should be converted to
10516        lowercase, uppercase, or remain unchanged
10517        :type column_case: str
10518        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10519        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10520        """
10521
10522        log.debug("Start transcrpts view creation from columns map...")
10523
10524        # "from_columns_map": [
10525        #     {
10526        #         "transcripts_column": "Ensembl_transcriptid",
10527        #         "transcripts_infos_columns": [
10528        #             "genename",
10529        #             "Ensembl_geneid",
10530        #             "LIST_S2_score",
10531        #             "LIST_S2_pred",
10532        #         ],
10533        #     },
10534        #     {
10535        #         "transcripts_column": "Ensembl_transcriptid",
10536        #         "transcripts_infos_columns": [
10537        #             "genename",
10538        #             "VARITY_R_score",
10539        #             "Aloft_pred",
10540        #         ],
10541        #     },
10542        # ],
10543
10544        # Init
10545        if temporary_tables is None:
10546            temporary_tables = []
10547        if annotation_fields is None:
10548            annotation_fields = []
10549
10550        # Variants table
10551        table_variants = self.get_table_variants()
10552
10553        for columns_map in columns_maps:
10554
10555            # Transcript column
10556            transcripts_column = columns_map.get("transcripts_column", None)
10557
10558            # Transcripts infos columns
10559            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10560
10561            # Transcripts infos columns rename
10562            column_rename = columns_map.get("column_rename", column_rename)
10563
10564            # Transcripts infos columns clean
10565            column_clean = columns_map.get("column_clean", column_clean)
10566
10567            # Transcripts infos columns case
10568            column_case = columns_map.get("column_case", column_case)
10569
10570            if transcripts_column is not None:
10571
10572                # Explode
10573                added_columns += self.explode_infos(
10574                    fields=[transcripts_column] + transcripts_infos_columns
10575                )
10576
10577                # View clauses
10578                clause_select_variants = []
10579                clause_select_tanscripts = []
10580                for field in [transcripts_column] + transcripts_infos_columns:
10581
10582                    # AS field
10583                    as_field = field
10584
10585                    # Rename
10586                    if column_rename:
10587                        as_field = column_rename.get(as_field, as_field)
10588
10589                    # Clean
10590                    if column_clean:
10591                        as_field = clean_annotation_field(as_field)
10592
10593                    # Case
10594                    if column_case:
10595                        if column_case.lower() in ["lower"]:
10596                            as_field = as_field.lower()
10597                        elif column_case.lower() in ["upper"]:
10598                            as_field = as_field.upper()
10599
10600                    # Clause select Variants
10601                    clause_select_variants.append(
10602                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10603                    )
10604
10605                    if field in [transcripts_column]:
10606                        clause_select_tanscripts.append(
10607                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10608                        )
10609                    else:
10610                        clause_select_tanscripts.append(
10611                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10612                        )
10613                        annotation_fields.append(as_field)
10614
10615                # Querey View
10616                query = f""" 
10617                    SELECT
10618                        "#CHROM", POS, REF, ALT, INFO,
10619                        "{transcripts_column}" AS 'transcript',
10620                        {", ".join(clause_select_tanscripts)}
10621                    FROM (
10622                        SELECT 
10623                            "#CHROM", POS, REF, ALT, INFO,
10624                            {", ".join(clause_select_variants)}
10625                        FROM {table_variants}
10626                        )
10627                    WHERE "{transcripts_column}" IS NOT NULL
10628                """
10629
10630                # Create temporary table
10631                temporary_table = transcripts_table + "".join(
10632                    random.choices(string.ascii_uppercase + string.digits, k=10)
10633                )
10634
10635                # Temporary_tables
10636                temporary_tables.append(temporary_table)
10637                query_view = f"""
10638                    CREATE TEMPORARY TABLE {temporary_table}
10639                    AS ({query})
10640                """
10641                self.execute_query(query=query_view)
10642
10643        return added_columns, temporary_tables, annotation_fields
10644
10645    def create_transcript_view_from_column_format(
10646        self,
10647        transcripts_table: str = "transcripts",
10648        column_formats: dict = {},
10649        temporary_tables: list = None,
10650        annotation_fields: list = None,
10651        column_rename: dict = {},
10652        column_clean: bool = False,
10653        column_case: str = None,
10654    ) -> tuple[list, list, list]:
10655        """
10656        The `create_transcript_view_from_column_format` function generates a transcript view based on
10657        specified column formats, adds additional columns and annotation fields, and returns the list of
10658        temporary tables and annotation fields.
10659
10660        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10661        of the table containing the transcripts data. This table will be used as the base table for
10662        creating the transcript view. The default value for this parameter is "transcripts", but you can
10663        provide a different table name if needed, defaults to transcripts
10664        :type transcripts_table: str (optional)
10665        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10666        about the columns to be used for creating the transcript view. Each entry in the dictionary
10667        specifies the mapping between a transcripts column and a transcripts infos column. This
10668        parameter allows you to define how the columns from the transcripts table should be transformed
10669        or mapped
10670        :type column_formats: dict
10671        :param temporary_tables: The `temporary_tables` parameter in the
10672        `create_transcript_view_from_column_format` function is a list that stores the names of
10673        temporary views created during the process of creating a transcript view from a column format.
10674        These temporary views are used to manipulate and extract data before generating the final
10675        transcript view
10676        :type temporary_tables: list
10677        :param annotation_fields: The `annotation_fields` parameter in the
10678        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10679        that are extracted from the temporary views created during the process. These annotation fields
10680        are obtained by querying the temporary views and extracting the column names excluding specific
10681        columns like `#CH
10682        :type annotation_fields: list
10683        :param column_rename: The `column_rename` parameter in the
10684        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10685        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10686        column names to new column names in this dictionary, you can rename specific columns during the
10687        process
10688        :type column_rename: dict
10689        :param column_clean: The `column_clean` parameter in the
10690        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10691        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10692        will be cleaned during the creation of the transcript view based on the specified column format,
10693        defaults to False
10694        :type column_clean: bool (optional)
10695        :param column_case: The `column_case` parameter in the
10696        `create_transcript_view_from_column_format` function is used to specify the case transformation
10697        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10698        to convert the column names to uppercase or lowercase, respectively
10699        :type column_case: str
10700        :return: The `create_transcript_view_from_column_format` function returns two lists:
10701        `temporary_tables` and `annotation_fields`.
10702        """
10703
10704        log.debug("Start transcrpts view creation from column format...")
10705
10706        #  "from_column_format": [
10707        #     {
10708        #         "transcripts_column": "ANN",
10709        #         "transcripts_infos_column": "Feature_ID",
10710        #     }
10711        # ],
10712
10713        # Init
10714        if temporary_tables is None:
10715            temporary_tables = []
10716        if annotation_fields is None:
10717            annotation_fields = []
10718
10719        for column_format in column_formats:
10720
10721            # annotation field and transcript annotation field
10722            annotation_field = column_format.get("transcripts_column", "ANN")
10723            transcript_annotation = column_format.get(
10724                "transcripts_infos_column", "Feature_ID"
10725            )
10726
10727            # Transcripts infos columns rename
10728            column_rename = column_format.get("column_rename", column_rename)
10729
10730            # Transcripts infos columns clean
10731            column_clean = column_format.get("column_clean", column_clean)
10732
10733            # Transcripts infos columns case
10734            column_case = column_format.get("column_case", column_case)
10735
10736            # Temporary View name
10737            temporary_view_name = transcripts_table + "".join(
10738                random.choices(string.ascii_uppercase + string.digits, k=10)
10739            )
10740
10741            # Create temporary view name
10742            temporary_view_name = self.annotation_format_to_table(
10743                uniquify=True,
10744                annotation_field=annotation_field,
10745                view_name=temporary_view_name,
10746                annotation_id=transcript_annotation,
10747                column_rename=column_rename,
10748                column_clean=column_clean,
10749                column_case=column_case,
10750            )
10751
10752            # Annotation fields
10753            if temporary_view_name:
10754                query_annotation_fields = f"""
10755                    SELECT *
10756                    FROM (
10757                        DESCRIBE SELECT *
10758                        FROM {temporary_view_name}
10759                        )
10760                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10761                """
10762                df_annotation_fields = self.get_query_to_df(
10763                    query=query_annotation_fields
10764                )
10765
10766                # Add temporary view and annotation fields
10767                temporary_tables.append(temporary_view_name)
10768                annotation_fields += list(set(df_annotation_fields["column_name"]))
10769
10770        return temporary_tables, annotation_fields
10771
10772    def create_transcript_view(
10773        self,
10774        transcripts_table: str = None,
10775        transcripts_table_drop: bool = True,
10776        param: dict = {},
10777    ) -> str:
10778        """
10779        The `create_transcript_view` function generates a transcript view by processing data from a
10780        specified table based on provided parameters and structural information.
10781
10782        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10783        is used to specify the name of the table that will store the final transcript view data. If a table
10784        name is not provided, the function will create a new table to store the transcript view data, and by
10785        default,, defaults to transcripts
10786        :type transcripts_table: str (optional)
10787        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10788        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10789        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10790        the function will drop the existing transcripts table if it exists, defaults to True
10791        :type transcripts_table_drop: bool (optional)
10792        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10793        contains information needed to create a transcript view. It includes details such as the structure
10794        of the transcripts, columns mapping, column formats, and other necessary information for generating
10795        the view. This parameter allows for flexibility and customization
10796        :type param: dict
10797        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10798        created or modified during the execution of the function.
10799        """
10800
10801        log.debug("Start transcripts view creation...")
10802
10803        # Default
10804        transcripts_table_default = "transcripts"
10805
10806        # Param
10807        if not param:
10808            param = self.get_param()
10809
10810        # Struct
10811        struct = param.get("transcripts", {}).get("struct", None)
10812
10813        # Transcript veresion
10814        transcript_id_remove_version = param.get("transcripts", {}).get(
10815            "transcript_id_remove_version", False
10816        )
10817
10818        # Transcripts mapping
10819        transcript_id_mapping_file = param.get("transcripts", {}).get(
10820            "transcript_id_mapping_file", None
10821        )
10822
10823        # Transcripts mapping
10824        transcript_id_mapping_force = param.get("transcripts", {}).get(
10825            "transcript_id_mapping_force", None
10826        )
10827
10828        if struct:
10829
10830            # Transcripts table
10831            if transcripts_table is None:
10832                transcripts_table = param.get("transcripts", {}).get(
10833                    "table", transcripts_table_default
10834                )
10835
10836            # added_columns
10837            added_columns = []
10838
10839            # Temporary tables
10840            temporary_tables = []
10841
10842            # Annotation fields
10843            annotation_fields = []
10844
10845            # from columns map
10846            columns_maps = struct.get("from_columns_map", [])
10847            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10848                self.create_transcript_view_from_columns_map(
10849                    transcripts_table=transcripts_table,
10850                    columns_maps=columns_maps,
10851                    added_columns=added_columns,
10852                    temporary_tables=temporary_tables,
10853                    annotation_fields=annotation_fields,
10854                )
10855            )
10856            added_columns += added_columns_tmp
10857            temporary_tables += temporary_tables_tmp
10858            annotation_fields += annotation_fields_tmp
10859
10860            # from column format
10861            column_formats = struct.get("from_column_format", [])
10862            temporary_tables_tmp, annotation_fields_tmp = (
10863                self.create_transcript_view_from_column_format(
10864                    transcripts_table=transcripts_table,
10865                    column_formats=column_formats,
10866                    temporary_tables=temporary_tables,
10867                    annotation_fields=annotation_fields,
10868                )
10869            )
10870            temporary_tables += temporary_tables_tmp
10871            annotation_fields += annotation_fields_tmp
10872
10873            # Remove some specific fields/column
10874            annotation_fields = list(set(annotation_fields))
10875            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10876                if field in annotation_fields:
10877                    annotation_fields.remove(field)
10878
10879            # Merge temporary tables query
10880            query_merge = ""
10881            for temporary_table in list(set(temporary_tables)):
10882
10883                # First temporary table
10884                if not query_merge:
10885                    query_merge = f"""
10886                        SELECT * FROM {temporary_table}
10887                    """
10888                # other temporary table (using UNION)
10889                else:
10890                    query_merge += f"""
10891                        UNION BY NAME SELECT * FROM {temporary_table}
10892                    """
10893
10894            # transcript table tmp
10895            transcript_table_tmp = "transcripts_tmp"
10896            transcript_table_tmp2 = "transcripts_tmp2"
10897            transcript_table_tmp3 = "transcripts_tmp3"
10898
10899            # Merge on transcript
10900            query_merge_on_transcripts_annotation_fields = []
10901
10902            # Add transcript list
10903            query_merge_on_transcripts_annotation_fields.append(
10904                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
10905            )
10906
10907            # Aggregate all annotations fields
10908            for annotation_field in set(annotation_fields):
10909                query_merge_on_transcripts_annotation_fields.append(
10910                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
10911                )
10912
10913            # Transcripts mapping
10914            if transcript_id_mapping_file:
10915
10916                # Transcript dataframe
10917                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
10918                transcript_id_mapping_dataframe = transcripts_file_to_df(
10919                    transcript_id_mapping_file, column_names=["transcript", "alias"]
10920                )
10921
10922                # Transcript version remove
10923                if transcript_id_remove_version:
10924                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
10925                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
10926                    query_left_join = f"""
10927                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10928                    """
10929                else:
10930                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
10931                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
10932                    query_left_join = f"""
10933                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10934                    """
10935
10936                # Transcript column for group by merge
10937                query_transcript_merge_group_by = """
10938                        CASE
10939                            WHEN transcript_mapped NOT IN ('')
10940                            THEN split_part(transcript_mapped, '.', 1)
10941                            ELSE split_part(transcript_original, '.', 1)
10942                        END
10943                    """
10944
10945                # Merge query
10946                transcripts_tmp2_query = f"""
10947                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
10948                    FROM ({query_merge}) AS {transcript_table_tmp}
10949                    {query_left_join}
10950                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
10951                """
10952
10953                # Retrive columns after mege
10954                transcripts_tmp2_describe_query = f"""
10955                    DESCRIBE {transcripts_tmp2_query}
10956                """
10957                transcripts_tmp2_describe_list = list(
10958                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
10959                        "column_name"
10960                    ]
10961                )
10962
10963                # Create list of columns for select clause
10964                transcripts_tmp2_describe_select_clause = []
10965                for field in transcripts_tmp2_describe_list:
10966                    if field not in [
10967                        "#CHROM",
10968                        "POS",
10969                        "REF",
10970                        "ALT",
10971                        "INFO",
10972                        "transcript_mapped",
10973                    ]:
10974                        as_field = field
10975                        if field in ["transcript_original"]:
10976                            as_field = "transcripts_mapped"
10977                        transcripts_tmp2_describe_select_clause.append(
10978                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
10979                        )
10980
10981                # Merge with mapping
10982                query_merge_on_transcripts = f"""
10983                    SELECT
10984                        "#CHROM", POS, REF, ALT, INFO,
10985                        CASE
10986                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
10987                            THEN ANY_VALUE(transcript_mapped)
10988                            ELSE ANY_VALUE(transcript_original)
10989                        END AS transcript,
10990                        {", ".join(transcripts_tmp2_describe_select_clause)}
10991                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
10992                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
10993                        {query_transcript_merge_group_by}
10994                """
10995
10996                # Add transcript filter from mapping file
10997                if transcript_id_mapping_force:
10998                    query_merge_on_transcripts = f"""
10999                        SELECT *
11000                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11001                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11002                    """
11003
11004            # No transcript mapping
11005            else:
11006
11007                # Remove transcript version
11008                if transcript_id_remove_version:
11009                    query_transcript_column = f"""
11010                        split_part({transcript_table_tmp}.transcript, '.', 1)
11011                    """
11012                else:
11013                    query_transcript_column = """
11014                        transcript
11015                    """
11016
11017                # Query sections
11018                query_transcript_column_select = (
11019                    f"{query_transcript_column} AS transcript"
11020                )
11021                query_transcript_column_group_by = query_transcript_column
11022
11023                # Query for transcripts view
11024                query_merge_on_transcripts = f"""
11025                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11026                    FROM ({query_merge}) AS {transcript_table_tmp}
11027                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11028                """
11029
11030            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")
11031
11032            # Drop transcript view is necessary
11033            if transcripts_table_drop:
11034                query_drop = f"""
11035                    DROP TABLE IF EXISTS {transcripts_table};
11036                """
11037                self.execute_query(query=query_drop)
11038
11039            # Merge and create transcript view
11040            query_create_view = f"""
11041                CREATE TABLE IF NOT EXISTS {transcripts_table}
11042                AS {query_merge_on_transcripts}
11043            """
11044            self.execute_query(query=query_create_view)
11045
11046            # Remove added columns
11047            for added_column in added_columns:
11048                self.drop_column(column=added_column)
11049
11050        else:
11051
11052            transcripts_table = None
11053
11054        return transcripts_table
11055
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Explode a snpEff/VEP-style annotation INFO field (e.g. "ANN") into a
        temporary table with one row per annotation entry and one typed
        column per annotation sub-field.

        Sub-field names are parsed from the INFO header description (the
        quoted 'A | B | C' part), optionally renamed (`column_rename`),
        cleaned (`column_clean`) and case-normalized (`column_case`); each
        resulting column is cast to a type detected from its values. The
        sub-field named by `annotation_id` is duplicated as the table's
        'transcript' column.

        :param uniquify: forwarded to `explode_annotation_format` to
            de-duplicate annotation values, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: name of the INFO field holding the
            annotations, defaults to "ANN"
        :type annotation_field: str (optional)
        :param annotation_id: annotation sub-field used as the transcript
            identifier, defaults to "Feature_ID"
        :type annotation_id: str (optional)
        :param view_name: name of the temporary table to create, defaults to
            "transcripts"
        :type view_name: str (optional)
        :param column_rename: mapping of original sub-field names to new
            column names
        :type column_rename: dict
        :param column_clean: if True, pass sub-field names through
            `clean_annotation_field`
        :type column_clean: bool (optional)
        :param column_case: "lower" or "upper" to force column-name case;
            None leaves names unchanged
        :type column_case: str
        :return: the name of the created temporary table, or None when
            `annotation_field` is not declared in the VCF header
        :raises ValueError: when the INFO header description does not contain
            a quoted 'A | B | C' field list
        """

        # Name of the derived column that holds the per-variant exploded
        # annotations as JSON
        annotation_format = "annotation_explode"

        # Apply the same rename/clean transformations to the annotation id as
        # are applied to the column names below, so the final SELECT matches
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix for exploded INFO columns.
        # NOTE(review): any truthy configured prefix is replaced by "INFO/"
        # here — confirm that discarding the configured value is intentional
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the source annotation field and its exploded form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the INFO field descriptions)
        vcf_reader = self.get_header()

        # Columns added to the variants table along the way; dropped on exit
        added_columns = []

        # Explode the annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the quoted
            # 'A | B | C' part of the INFO header description
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only key mapped to the original field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    # NOTE(review): ann_header is populated but only
                    # ann_header_desc is used below
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (unique key per variant row)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Materialize the variants and their annotation column into a
            # pandas DataFrame
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each raw annotation value into per-entry JSON
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Collect the distinct JSON keys present in the exploded
            # annotations. The pandas frame `dataframe_annotation_format` is
            # referenced by name in the SQL (DuckDB replacement scan), so
            # renaming the local variable would break these queries.
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]
                key_clean = key

                # key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Sample this key's values so the column type can be detected
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the non-empty sampled values
                column_type = detect_column_type(df_json_type[key_clean])

                # SELECT expression: empty strings become NULL, then cast to
                # the detected type
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, duplicating the annotation id
            # column as 'transcript'
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                        )
                    );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from the header: nothing to explode
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
11255
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Write transcript annotations from a transcripts table back onto the
        variants table, as a JSON column and/or INFO field and/or a
        pipe-separated structured column/INFO field.

        Each parameter left as None falls back to the corresponding entry of
        `param["transcripts"]` (or `self.get_param()` when `param` is empty).
        Created columns and INFO fields are also declared in the in-memory
        VCF header (`self.get_header()`), which this method mutates.

        :param transcripts_table: name of the table holding the transcript
            annotations; falls back to param, then "transcripts"
        :type transcripts_table: str
        :param transcripts_column_id: column of `transcripts_table` holding
            the transcript identifier; falls back to param, then "transcript"
        :type transcripts_column_id: str
        :param transcripts_info_json: name of a JSON column to create on the
            variants table and fill with per-transcript annotations
        :type transcripts_info_json: str
        :param transcripts_info_field_json: name of an INFO field to append
            with the same JSON payload
        :type transcripts_info_field_json: str
        :param transcripts_info_format: name of a VARCHAR column to create
            and fill with pipe-separated 'transcript|field|...' records
        :type transcripts_info_format: str
        :param transcripts_info_field_format: name of an INFO field to append
            with the same pipe-separated payload
        :type transcripts_info_field_format: str
        :param param: parameter dictionary used to resolve defaults
        :type param: dict
        :return: False when none of the four output targets is configured,
            True otherwise
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param (fall back to object-level parameters when not provided)
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # No output target configured at all: nothing to do
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Annotation columns of the transcripts table (everything except the
        # variant key columns and the transcript id column)
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
                )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # Build SELECT/JSON/FORMAT clause fragments, one per annotation column
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            # Do not consider INFO field for export into fields
            if field not in ["INFO"]:
                # regexp_split_to_table expands comma-joined values into rows
                clause_select.append(
                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
                )
                clause_to_json.append(f""" '{field}': "{field}" """)
                clause_to_format.append(f""" "{field}" """)

        # SET clauses accumulated for the two UPDATE statements below
        update_set_json = []
        update_set_format = []

        # VCF header (mutated in place to declare the new INFO fields)
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header
            # NOTE(review): "unknwon" is a typo for "unknown" (source/version
            # of the header line); left as-is since other code may compare
            # against the existing string
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append ';<field>=<json>' to INFO, guarding against empty or '.'
            # values on either side
            update_set_json.append(
                f""" 
                    INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                                THEN concat(
                                    ';{transcripts_info_field_json}=',
                                    t.{transcripts_info_json}
                                )
                                ELSE ''
                            END
                            )
                """
            )

            # Add header
            # NOTE(review): same "unknwon" typo as above
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Update query: aggregate, per variant, one JSON object mapping
            # each transcript id to its annotation object, then join back to
            # the variants table on the variant key
            query_update = f"""
                UPDATE {table_variants}
                    SET {", ".join(update_set_json)}
                FROM
                (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                            concat(
                            '{{',
                            string_agg(
                                '"' || "{transcripts_column_id}" || '":' ||
                                to_json(json_output)
                            ),
                            '}}'
                            )::JSON AS {transcripts_info_json}
                    FROM
                        (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            to_json(
                                {{{",".join(clause_to_json)}}}
                            )::JSON AS json_output
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                        WHERE "{transcripts_column_id}" IS NOT NULL
                        )
                    GROUP BY "#CHROM", POS, REF, ALT
                ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header
            # NOTE(review): same "unknwon" typo as above
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        else:

            # No output column requested: still need a name for the derived
            # expression used inside the FORMAT update query below
            transcripts_info_format = "transcripts_info_format"

        # Transcripts to info field in structured (pipe-separated) format
        if transcripts_info_field_format:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append ';<field>=<value>' to INFO, guarding against empty or
            # '.' values on either side
            update_set_format.append(
                f""" 
                    INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                                THEN concat(
                                    ';{transcripts_info_field_format}=',
                                    t.{transcripts_info_format}
                                )
                                ELSE ''
                            END
                            )
                """
            )

            # Add header
            # NOTE(review): same "unknwon" typo as above
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Update query: build one 'transcript|field|...' record per
            # transcript, aggregate them per variant, then join back to the
            # variants table on the variant key
            query_update = f"""
                UPDATE {table_variants}
                    SET {", ".join(update_set_format)}
                FROM
                (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                    FROM 
                        (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            concat(
                                "{transcripts_column_id}",
                                '|',
                                {", '|', ".join(clause_to_format)}
                            ) AS {transcripts_info_format}
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                        )
                    GROUP BY "#CHROM", POS, REF, ALT
                ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
class Variants:
   36class Variants:
   37
   38    def __init__(
   39        self,
   40        conn=None,
   41        input: str = None,
   42        output: str = None,
   43        config: dict = {},
   44        param: dict = {},
   45        load: bool = False,
   46    ) -> None:
   47        """
   48        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   49        header
   50
   51        :param conn: the connection to the database
   52        :param input: the input file
   53        :param output: the output file
   54        :param config: a dictionary containing the configuration of the model
   55        :param param: a dictionary containing the parameters of the model
   56        """
   57
   58        # Init variables
   59        self.init_variables()
   60
   61        # Input
   62        self.set_input(input)
   63
   64        # Config
   65        self.set_config(config)
   66
   67        # Param
   68        self.set_param(param)
   69
   70        # Output
   71        self.set_output(output)
   72
   73        # connexion
   74        self.set_connexion(conn)
   75
   76        # Header
   77        self.set_header()
   78
   79        # Samples
   80        self.set_samples()
   81
   82        # Load data
   83        if load:
   84            self.load_data()
   85
   86    def set_samples(self, samples: list = None) -> list:
   87        """
   88        The function `set_samples` sets the samples attribute of an object to a provided list or
   89        retrieves it from a parameter dictionary.
   90
   91        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   92        input and sets the `samples` attribute of the class to the provided list. If no samples are
   93        provided, it tries to get the samples from the class's parameters using the `get_param` method
   94        :type samples: list
   95        :return: The `samples` list is being returned.
   96        """
   97
   98        if not samples:
   99            samples = self.get_param().get("samples", {}).get("list", None)
  100
  101        self.samples = samples
  102
  103        return samples
  104
  105    def get_samples(self) -> list:
  106        """
  107        This function returns a list of samples.
  108        :return: The `get_samples` method is returning the `samples` attribute of the object.
  109        """
  110
  111        return self.samples
  112
  113    def get_samples_check(self) -> bool:
  114        """
  115        This function returns the value of the "check" key within the "samples" dictionary retrieved
  116        from the parameters.
  117        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  118        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  119        method. If the key "check" is not found, it will return `False`.
  120        """
  121
  122        return self.get_param().get("samples", {}).get("check", True)
  123
  124    def set_input(self, input: str = None) -> None:
  125        """
  126        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  127        attributes in the class accordingly.
  128
  129        :param input: The `set_input` method in the provided code snippet is used to set attributes
  130        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  131        :type input: str
  132        """
  133
  134        if input and not isinstance(input, str):
  135            try:
  136                self.input = input.name
  137            except:
  138                log.error(f"Input file '{input} in bad format")
  139                raise ValueError(f"Input file '{input} in bad format")
  140        else:
  141            self.input = input
  142
  143        # Input format
  144        if input:
  145            input_name, input_extension = os.path.splitext(self.input)
  146            self.input_name = input_name
  147            self.input_extension = input_extension
  148            self.input_format = self.input_extension.replace(".", "")
  149
  150    def set_config(self, config: dict) -> None:
  151        """
  152        The set_config function takes a config object and assigns it as the configuration object for the
  153        class.
  154
  155        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  156        contains configuration settings for the class. When you call the `set_config` function with a
  157        dictionary object as the argument, it will set that dictionary as the configuration object for
  158        the class
  159        :type config: dict
  160        """
  161
  162        self.config = config
  163
  164    def set_param(self, param: dict) -> None:
  165        """
  166        This function sets a parameter object for the class based on the input dictionary.
  167
  168        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  169        as the `param` attribute of the class instance
  170        :type param: dict
  171        """
  172
  173        self.param = param
  174
  175    def init_variables(self) -> None:
  176        """
  177        This function initializes the variables that will be used in the rest of the class
  178        """
  179
  180        self.prefix = "howard"
  181        self.table_variants = "variants"
  182        self.dataframe = None
  183
  184        self.comparison_map = {
  185            "gt": ">",
  186            "gte": ">=",
  187            "lt": "<",
  188            "lte": "<=",
  189            "equals": "=",
  190            "contains": "SIMILAR TO",
  191        }
  192
  193        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  194
  195        self.code_type_map_to_sql = {
  196            "Integer": "INTEGER",
  197            "String": "VARCHAR",
  198            "Float": "FLOAT",
  199            "Flag": "VARCHAR",
  200        }
  201
  202        self.index_additionnal_fields = []
  203
  204    def get_indexing(self) -> bool:
  205        """
  206        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  207        returns False.
  208        :return: The value of the indexing parameter.
  209        """
  210
  211        return self.get_param().get("indexing", False)
  212
  213    def get_connexion_config(self) -> dict:
  214        """
  215        The function `get_connexion_config` returns a dictionary containing the configuration for a
  216        connection, including the number of threads and memory limit.
  217        :return: a dictionary containing the configuration for the Connexion library.
  218        """
  219
  220        # config
  221        config = self.get_config()
  222
  223        # Connexion config
  224        connexion_config = {}
  225        threads = self.get_threads()
  226
  227        # Threads
  228        if threads:
  229            connexion_config["threads"] = threads
  230
  231        # Memory
  232        # if config.get("memory", None):
  233        #     connexion_config["memory_limit"] = config.get("memory")
  234        if self.get_memory():
  235            connexion_config["memory_limit"] = self.get_memory()
  236
  237        # Temporary directory
  238        if config.get("tmp", None):
  239            connexion_config["temp_directory"] = config.get("tmp")
  240
  241        # Access
  242        if config.get("access", None):
  243            access = config.get("access")
  244            if access in ["RO"]:
  245                access = "READ_ONLY"
  246            elif access in ["RW"]:
  247                access = "READ_WRITE"
  248            connexion_db = self.get_connexion_db()
  249            if connexion_db in ":memory:":
  250                access = "READ_WRITE"
  251            connexion_config["access_mode"] = access
  252
  253        return connexion_config
  254
  255    def get_duckdb_settings(self) -> dict:
  256        """
  257        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  258        string.
  259        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  260        """
  261
  262        # config
  263        config = self.get_config()
  264
  265        # duckdb settings
  266        duckdb_settings_dict = {}
  267        if config.get("duckdb_settings", None):
  268            duckdb_settings = config.get("duckdb_settings")
  269            duckdb_settings = full_path(duckdb_settings)
  270            # duckdb setting is a file
  271            if os.path.exists(duckdb_settings):
  272                with open(duckdb_settings) as json_file:
  273                    duckdb_settings_dict = yaml.safe_load(json_file)
  274            # duckdb settings is a string
  275            else:
  276                duckdb_settings_dict = json.loads(duckdb_settings)
  277
  278        return duckdb_settings_dict
  279
  280    def set_connexion_db(self) -> str:
  281        """
  282        The function `set_connexion_db` returns the appropriate database connection string based on the
  283        input format and connection type.
  284        :return: the value of the variable `connexion_db`.
  285        """
  286
  287        # Default connexion db
  288        default_connexion_db = ":memory:"
  289
  290        # Find connexion db
  291        if self.get_input_format() in ["db", "duckdb"]:
  292            connexion_db = self.get_input()
  293        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  294            connexion_db = default_connexion_db
  295        elif self.get_connexion_type() in ["tmpfile"]:
  296            tmp_name = tempfile.mkdtemp(
  297                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  298            )
  299            connexion_db = f"{tmp_name}/tmp.db"
  300        elif self.get_connexion_type() != "":
  301            connexion_db = self.get_connexion_type()
  302        else:
  303            connexion_db = default_connexion_db
  304
  305        # Set connexion db
  306        self.connexion_db = connexion_db
  307
  308        return connexion_db
  309
  310    def set_connexion(self, conn) -> None:
  311        """
  312        The function `set_connexion` creates a connection to a database, with options for different
  313        database formats and settings.
  314
  315        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  316        database. If a connection is not provided, a new connection to an in-memory database is created.
  317        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  318        sqlite
  319        """
  320
  321        # Connexion db
  322        connexion_db = self.set_connexion_db()
  323
  324        # Connexion config
  325        connexion_config = self.get_connexion_config()
  326
  327        # Connexion format
  328        connexion_format = self.get_config().get("connexion_format", "duckdb")
  329        # Set connexion format
  330        self.connexion_format = connexion_format
  331
  332        # Connexion
  333        if not conn:
  334            if connexion_format in ["duckdb"]:
  335                conn = duckdb.connect(connexion_db, config=connexion_config)
  336                # duckDB settings
  337                duckdb_settings = self.get_duckdb_settings()
  338                if duckdb_settings:
  339                    for setting in duckdb_settings:
  340                        setting_value = duckdb_settings.get(setting)
  341                        if isinstance(setting_value, str):
  342                            setting_value = f"'{setting_value}'"
  343                        conn.execute(f"PRAGMA {setting}={setting_value};")
  344            elif connexion_format in ["sqlite"]:
  345                conn = sqlite3.connect(connexion_db)
  346
  347        # Set connexion
  348        self.conn = conn
  349
  350        # Log
  351        log.debug(f"connexion_format: {connexion_format}")
  352        log.debug(f"connexion_db: {connexion_db}")
  353        log.debug(f"connexion config: {connexion_config}")
  354        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  355
  356    def set_output(self, output: str = None) -> None:
  357        """
  358        The `set_output` function in Python sets the output file based on the input or a specified key
  359        in the config file, extracting the output name, extension, and format.
  360
  361        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  362        the output file. If the config file has an 'output' key, the method sets the output to the value
  363        of that key. If no output is provided, it sets the output to `None`
  364        :type output: str
  365        """
  366
  367        if output and not isinstance(output, str):
  368            self.output = output.name
  369        else:
  370            self.output = output
  371
  372        # Output format
  373        if self.output:
  374            output_name, output_extension = os.path.splitext(self.output)
  375            self.output_name = output_name
  376            self.output_extension = output_extension
  377            self.output_format = self.output_extension.replace(".", "")
  378        else:
  379            self.output_name = None
  380            self.output_extension = None
  381            self.output_format = None
  382
  383    def set_header(self) -> None:
  384        """
  385        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
  386        """
  387
  388        input_file = self.get_input()
  389        default_header_list = [
  390            "##fileformat=VCFv4.2",
  391            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
  392        ]
  393
  394        # Full path
  395        input_file = full_path(input_file)
  396
  397        if input_file:
  398
  399            input_format = self.get_input_format()
  400            input_compressed = self.get_input_compressed()
  401            config = self.get_config()
  402            header_list = default_header_list
  403            if input_format in [
  404                "vcf",
  405                "hdr",
  406                "tsv",
  407                "csv",
  408                "psv",
  409                "parquet",
  410                "db",
  411                "duckdb",
  412            ]:
  413                # header provided in param
  414                if config.get("header_file", None):
  415                    with open(config.get("header_file"), "rt") as f:
  416                        header_list = self.read_vcf_header(f)
  417                # within a vcf file format (header within input file itsself)
  418                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
  419                    # within a compressed vcf file format (.vcf.gz)
  420                    if input_compressed:
  421                        with bgzf.open(input_file, "rt") as f:
  422                            header_list = self.read_vcf_header(f)
  423                    # within an uncompressed vcf file format (.vcf)
  424                    else:
  425                        with open(input_file, "rt") as f:
  426                            header_list = self.read_vcf_header(f)
  427                # header provided in default external file .hdr
  428                elif os.path.exists((input_file + ".hdr")):
  429                    with open(input_file + ".hdr", "rt") as f:
  430                        header_list = self.read_vcf_header(f)
  431                else:
  432                    try:  # Try to get header info fields and file columns
  433
  434                        with tempfile.TemporaryDirectory() as tmpdir:
  435
  436                            # Create database
  437                            db_for_header = Database(database=input_file)
  438
  439                            # Get header columns for infos fields
  440                            db_header_from_columns = (
  441                                db_for_header.get_header_from_columns()
  442                            )
  443
  444                            # Get real columns in the file
  445                            db_header_columns = db_for_header.get_columns()
  446
  447                            # Write header file
  448                            header_file_tmp = os.path.join(tmpdir, "header")
  449                            f = open(header_file_tmp, "w")
  450                            vcf.Writer(f, db_header_from_columns)
  451                            f.close()
  452
  453                            # Replace #CHROM line with rel columns
  454                            header_list = db_for_header.read_header_file(
  455                                header_file=header_file_tmp
  456                            )
  457                            header_list[-1] = "\t".join(db_header_columns)
  458
  459                    except:
  460
  461                        log.warning(
  462                            f"No header for file {input_file}. Set as default VCF header"
  463                        )
  464                        header_list = default_header_list
  465
  466            else:  # try for unknown format ?
  467
  468                log.error(f"Input file format '{input_format}' not available")
  469                raise ValueError(f"Input file format '{input_format}' not available")
  470
  471            if not header_list:
  472                header_list = default_header_list
  473
  474            # header as list
  475            self.header_list = header_list
  476
  477            # header as VCF object
  478            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
  479
  480        else:
  481
  482            self.header_list = None
  483            self.header_vcf = None
  484
  485    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  486        """
  487        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  488        DataFrame based on the connection format.
  489
  490        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  491        represents the SQL query you want to execute. This query will be used to fetch data from a
  492        database and convert it into a pandas DataFrame
  493        :type query: str
  494        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  495        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  496        function will only fetch up to that number of rows from the database query result. If no limit
  497        is specified,
  498        :type limit: int
  499        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  500        """
  501
  502        # Connexion format
  503        connexion_format = self.get_connexion_format()
  504
  505        # Limit in query
  506        if limit:
  507            pd.set_option("display.max_rows", limit)
  508            if connexion_format in ["duckdb"]:
  509                df = (
  510                    self.conn.execute(query)
  511                    .fetch_record_batch(limit)
  512                    .read_next_batch()
  513                    .to_pandas()
  514                )
  515            elif connexion_format in ["sqlite"]:
  516                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  517
  518        # Full query
  519        else:
  520            if connexion_format in ["duckdb"]:
  521                df = self.conn.execute(query).df()
  522            elif connexion_format in ["sqlite"]:
  523                df = pd.read_sql_query(query, self.conn)
  524
  525        return df
  526
  527    def get_overview(self) -> None:
  528        """
  529        The function prints the input, output, config, and dataframe of the current object
  530        """
  531        table_variants_from = self.get_table_variants(clause="from")
  532        sql_columns = self.get_header_columns_as_sql()
  533        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  534        df = self.get_query_to_df(sql_query_export)
  535        log.info(
  536            "Input:  "
  537            + str(self.get_input())
  538            + " ["
  539            + str(str(self.get_input_format()))
  540            + "]"
  541        )
  542        log.info(
  543            "Output: "
  544            + str(self.get_output())
  545            + " ["
  546            + str(str(self.get_output_format()))
  547            + "]"
  548        )
  549        log.info("Config: ")
  550        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  551            "\n"
  552        ):
  553            log.info("\t" + str(d))
  554        log.info("Param: ")
  555        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  556            "\n"
  557        ):
  558            log.info("\t" + str(d))
  559        log.info("Sample list: " + str(self.get_header_sample_list()))
  560        log.info("Dataframe: ")
  561        for d in str(df).split("\n"):
  562            log.info("\t" + str(d))
  563
  564        # garbage collector
  565        del df
  566        gc.collect()
  567
  568        return None
  569
  570    def get_stats(self) -> dict:
  571        """
  572        The `get_stats` function calculates and returns various statistics of the current object,
  573        including information about the input file, variants, samples, header fields, quality, and
  574        SNVs/InDels.
  575        :return: a dictionary containing various statistics of the current object. The dictionary has
  576        the following structure:
  577        """
  578
  579        # Log
  580        log.info(f"Stats Calculation...")
  581
  582        # table varaints
  583        table_variants_from = self.get_table_variants()
  584
  585        # stats dict
  586        stats = {"Infos": {}}
  587
  588        ### File
  589        input_file = self.get_input()
  590        stats["Infos"]["Input file"] = input_file
  591
  592        # Header
  593        header_infos = self.get_header().infos
  594        header_formats = self.get_header().formats
  595        header_infos_list = list(header_infos)
  596        header_formats_list = list(header_formats)
  597
  598        ### Variants
  599
  600        stats["Variants"] = {}
  601
  602        # Variants by chr
  603        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
  604        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
  605        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
  606            by=["CHROM"], kind="quicksort"
  607        )
  608
  609        # Total number of variants
  610        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
  611
  612        # Calculate percentage
  613        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
  614            lambda x: (x / nb_of_variants)
  615        )
  616
  617        stats["Variants"]["Number of variants by chromosome"] = (
  618            nb_of_variants_by_chrom.to_dict(orient="index")
  619        )
  620
  621        stats["Infos"]["Number of variants"] = int(nb_of_variants)
  622
  623        ### Samples
  624
  625        # Init
  626        samples = {}
  627        nb_of_samples = 0
  628
  629        # Check Samples
  630        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
  631            log.debug(f"Check samples...")
  632            for sample in self.get_header_sample_list():
  633                sql_query_samples = f"""
  634                    SELECT  '{sample}' as sample,
  635                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
  636                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
  637                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
  638                    FROM {table_variants_from}
  639                    WHERE (
  640                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
  641                        AND
  642                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
  643                      )
  644                    GROUP BY genotype
  645                    """
  646                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
  647                sample_genotype_count = sql_query_genotype_df["count"].sum()
  648                if len(sql_query_genotype_df):
  649                    nb_of_samples += 1
  650                    samples[f"{sample} - {sample_genotype_count} variants"] = (
  651                        sql_query_genotype_df.to_dict(orient="index")
  652                    )
  653
  654            stats["Samples"] = samples
  655            stats["Infos"]["Number of samples"] = nb_of_samples
  656
  657        # #
  658        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
  659        #     stats["Infos"]["Number of samples"] = nb_of_samples
  660        # elif nb_of_samples:
  661        #     stats["Infos"]["Number of samples"] = "not a VCF format"
  662
  663        ### INFO and FORMAT fields
  664        header_types_df = {}
  665        header_types_list = {
  666            "List of INFO fields": header_infos,
  667            "List of FORMAT fields": header_formats,
  668        }
  669        i = 0
  670        for header_type in header_types_list:
  671
  672            header_type_infos = header_types_list.get(header_type)
  673            header_infos_dict = {}
  674
  675            for info in header_type_infos:
  676
  677                i += 1
  678                header_infos_dict[i] = {}
  679
  680                # ID
  681                header_infos_dict[i]["id"] = info
  682
  683                # num
  684                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
  685                if header_type_infos[info].num in genotype_map.keys():
  686                    header_infos_dict[i]["Number"] = genotype_map.get(
  687                        header_type_infos[info].num
  688                    )
  689                else:
  690                    header_infos_dict[i]["Number"] = header_type_infos[info].num
  691
  692                # type
  693                if header_type_infos[info].type:
  694                    header_infos_dict[i]["Type"] = header_type_infos[info].type
  695                else:
  696                    header_infos_dict[i]["Type"] = "."
  697
  698                # desc
  699                if header_type_infos[info].desc != None:
  700                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
  701                else:
  702                    header_infos_dict[i]["Description"] = ""
  703
  704            if len(header_infos_dict):
  705                header_types_df[header_type] = pd.DataFrame.from_dict(
  706                    header_infos_dict, orient="index"
  707                ).to_dict(orient="index")
  708
  709        # Stats
  710        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
  711        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
  712        stats["Header"] = header_types_df
  713
  714        ### QUAL
  715        if "QUAL" in self.get_header_columns():
  716            sql_query_qual = f"""
  717                    SELECT
  718                        avg(CAST(QUAL AS INTEGER)) AS Average,
  719                        min(CAST(QUAL AS INTEGER)) AS Minimum,
  720                        max(CAST(QUAL AS INTEGER)) AS Maximum,
  721                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
  722                        median(CAST(QUAL AS INTEGER)) AS Median,
  723                        variance(CAST(QUAL AS INTEGER)) AS Variance
  724                    FROM {table_variants_from}
  725                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
  726                    """
  727
  728            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
  729            stats["Quality"] = {"Stats": qual}
  730
  731        ### SNV and InDel
  732
  733        sql_query_snv = f"""
  734            
  735            SELECT Type, count FROM (
  736
  737                    SELECT
  738                        'Total' AS Type,
  739                        count(*) AS count
  740                    FROM {table_variants_from}
  741
  742                    UNION
  743
  744                    SELECT
  745                        'MNV' AS Type,
  746                        count(*) AS count
  747                    FROM {table_variants_from}
  748                    WHERE len(REF) > 1 AND len(ALT) > 1
  749                    AND len(REF) = len(ALT)
  750
  751                    UNION
  752
  753                    SELECT
  754                        'InDel' AS Type,
  755                        count(*) AS count
  756                    FROM {table_variants_from}
  757                    WHERE len(REF) > 1 OR len(ALT) > 1
  758                    AND len(REF) != len(ALT)
  759                    
  760                    UNION
  761
  762                    SELECT
  763                        'SNV' AS Type,
  764                        count(*) AS count
  765                    FROM {table_variants_from}
  766                    WHERE len(REF) = 1 AND len(ALT) = 1
  767
  768                )
  769
  770            ORDER BY count DESC
  771
  772                """
  773        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
  774
  775        sql_query_snv_substitution = f"""
  776                SELECT
  777                    concat(REF, '>', ALT) AS 'Substitution',
  778                    count(*) AS count
  779                FROM {table_variants_from}
  780                WHERE len(REF) = 1 AND len(ALT) = 1
  781                GROUP BY REF, ALT
  782                ORDER BY count(*) DESC
  783                """
  784        snv_substitution = (
  785            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
  786        )
  787        stats["Variants"]["Counts"] = snv_indel
  788        stats["Variants"]["Substitutions"] = snv_substitution
  789
  790        return stats
  791
  792    def stats_to_file(self, file: str = None) -> str:
  793        """
  794        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  795        into a JSON object, and writes the JSON object to the specified file.
  796
  797        :param file: The `file` parameter is a string that represents the file path where the JSON data
  798        will be written
  799        :type file: str
  800        :return: the name of the file that was written to.
  801        """
  802
  803        # Get stats
  804        stats = self.get_stats()
  805
  806        # Serializing json
  807        json_object = json.dumps(stats, indent=4)
  808
  809        # Writing to sample.json
  810        with open(file, "w") as outfile:
  811            outfile.write(json_object)
  812
  813        return file
  814
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown report of the statistics and print it to stdout.

        The statistics are first serialized to `json_file` (via
        `stats_to_file`), then read back and rendered as markdown into
        `output_file`, and finally printed.

        :param output_file: Path of the markdown file to write; if not
            provided, a temporary "stats.md" file is used
        :type output_file: str
        :param json_file: Path of the JSON file where the statistics are
            saved; if not provided, a temporary "stats.json" file is used
        :type json_file: str
        :return: None
        """

        # Resolve full paths (expansion of relative/user paths)
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Default to temporary files when no paths are provided
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create parent folders if needed
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Load the stats back; yaml.safe_load also parses the JSON file
            # (JSON is a subset of YAML)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Markdown document parts: title, index, and section bodies
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # One markdown section per stats section
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the info as a markdown table, either
                        # directly from a dict or after JSON-decoding a string
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"   - [{info}]({info_link})")
                            # NOTE(review): to_markdown requires the optional
                            # "tabulate" dependency — confirm it is installed
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            # Plain key/value line when not table-like
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write title, index and sections to the markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Print to stdout
            # NOTE(review): the index is written to the file but not printed
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
  916
  917    def get_input(self) -> str:
  918        """
  919        It returns the value of the input variable.
  920        :return: The input is being returned.
  921        """
  922        return self.input
  923
  924    def get_input_format(self, input_file: str = None) -> str:
  925        """
  926        This function returns the format of the input variable, either from the provided input file or
  927        by prompting for input.
  928
  929        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  930        represents the file path of the input file. If no `input_file` is provided when calling the
  931        method, it will default to `None`
  932        :type input_file: str
  933        :return: The format of the input variable is being returned.
  934        """
  935
  936        if not input_file:
  937            input_file = self.get_input()
  938        input_format = get_file_format(input_file)
  939        return input_format
  940
  941    def get_input_compressed(self, input_file: str = None) -> str:
  942        """
  943        The function `get_input_compressed` returns the format of the input variable after compressing
  944        it.
  945
  946        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  947        that represents the file path of the input file. If no `input_file` is provided when calling the
  948        method, it will default to `None` and the method will then call `self.get_input()` to
  949        :type input_file: str
  950        :return: The function `get_input_compressed` returns the compressed format of the input
  951        variable.
  952        """
  953
  954        if not input_file:
  955            input_file = self.get_input()
  956        input_compressed = get_file_compressed(input_file)
  957        return input_compressed
  958
  959    def get_output(self) -> str:
  960        """
  961        It returns the output of the neuron.
  962        :return: The output of the neural network.
  963        """
  964
  965        return self.output
  966
  967    def get_output_format(self, output_file: str = None) -> str:
  968        """
  969        The function `get_output_format` returns the format of the input variable or the output file if
  970        provided.
  971
  972        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  973        that represents the file path of the output file. If no `output_file` is provided when calling
  974        the method, it will default to the output obtained from the `get_output` method of the class
  975        instance. The
  976        :type output_file: str
  977        :return: The format of the input variable is being returned.
  978        """
  979
  980        if not output_file:
  981            output_file = self.get_output()
  982        output_format = get_file_format(output_file)
  983
  984        return output_format
  985
  986    def get_config(self) -> dict:
  987        """
  988        It returns the config
  989        :return: The config variable is being returned.
  990        """
  991        return self.config
  992
  993    def get_param(self) -> dict:
  994        """
  995        It returns the param
  996        :return: The param variable is being returned.
  997        """
  998        return self.param
  999
 1000    def get_connexion_db(self) -> str:
 1001        """
 1002        It returns the connexion_db attribute of the object
 1003        :return: The connexion_db is being returned.
 1004        """
 1005        return self.connexion_db
 1006
 1007    def get_prefix(self) -> str:
 1008        """
 1009        It returns the prefix of the object.
 1010        :return: The prefix is being returned.
 1011        """
 1012        return self.prefix
 1013
 1014    def get_table_variants(self, clause: str = "select") -> str:
 1015        """
 1016        This function returns the table_variants attribute of the object
 1017
 1018        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1019        defaults to select (optional)
 1020        :return: The table_variants attribute of the object.
 1021        """
 1022
 1023        # Access
 1024        access = self.get_config().get("access", None)
 1025
 1026        # Clauses "select", "where", "update"
 1027        if clause in ["select", "where", "update"]:
 1028            table_variants = self.table_variants
 1029        # Clause "from"
 1030        elif clause in ["from"]:
 1031            # For Read Only
 1032            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1033                input_file = self.get_input()
 1034                table_variants = f"'{input_file}' as variants"
 1035            # For Read Write
 1036            else:
 1037                table_variants = f"{self.table_variants} as variants"
 1038        else:
 1039            table_variants = self.table_variants
 1040        return table_variants
 1041
 1042    def get_tmp_dir(self) -> str:
 1043        """
 1044        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1045        parameters or a default path.
 1046        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1047        configuration, parameters, and a default value of "/tmp".
 1048        """
 1049
 1050        return get_tmp(
 1051            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1052        )
 1053
 1054    def get_connexion_type(self) -> str:
 1055        """
 1056        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1057
 1058        :return: The connexion type is being returned.
 1059        """
 1060        return self.get_config().get("connexion_type", "memory")
 1061
 1062    def get_connexion(self):
 1063        """
 1064        It returns the connection object
 1065
 1066        :return: The connection object.
 1067        """
 1068        return self.conn
 1069
 1070    def close_connexion(self) -> None:
 1071        """
 1072        This function closes the connection to the database.
 1073        :return: The connection is being closed.
 1074        """
 1075        return self.conn.close()
 1076
 1077    def get_header(self, type: str = "vcf"):
 1078        """
 1079        This function returns the header of the VCF file as a list of strings
 1080
 1081        :param type: the type of header you want to get, defaults to vcf (optional)
 1082        :return: The header of the vcf file.
 1083        """
 1084
 1085        if self.header_vcf:
 1086            if type == "vcf":
 1087                return self.header_vcf
 1088            elif type == "list":
 1089                return self.header_list
 1090        else:
 1091            if type == "vcf":
 1092                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1093                return header
 1094            elif type == "list":
 1095                return vcf_required
 1096
 1097    def get_header_infos_list(self) -> list:
 1098        """
 1099        This function retrieves a list of information fields from the header.
 1100        :return: A list of information fields from the header.
 1101        """
 1102
 1103        # Init
 1104        infos_list = []
 1105
 1106        for field in self.get_header().infos:
 1107            infos_list.append(field)
 1108
 1109        return infos_list
 1110
 1111    def get_header_length(self, file: str = None) -> int:
 1112        """
 1113        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1114        line.
 1115
 1116        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1117        header file. If this argument is provided, the function will read the header from the specified
 1118        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1119        :type file: str
 1120        :return: the length of the header list, excluding the #CHROM line.
 1121        """
 1122
 1123        if file:
 1124            return len(self.read_vcf_header_file(file=file)) - 1
 1125        elif self.get_header(type="list"):
 1126            return len(self.get_header(type="list")) - 1
 1127        else:
 1128            return 0
 1129
 1130    def get_header_columns(self) -> str:
 1131        """
 1132        This function returns the header list of a VCF
 1133
 1134        :return: The length of the header list.
 1135        """
 1136        if self.get_header():
 1137            return self.get_header(type="list")[-1]
 1138        else:
 1139            return ""
 1140
 1141    def get_header_columns_as_list(self) -> list:
 1142        """
 1143        This function returns the header list of a VCF
 1144
 1145        :return: The length of the header list.
 1146        """
 1147        if self.get_header():
 1148            return self.get_header_columns().strip().split("\t")
 1149        else:
 1150            return []
 1151
 1152    def get_header_columns_as_sql(self) -> str:
 1153        """
 1154        This function retruns header length (without #CHROM line)
 1155
 1156        :return: The length of the header list.
 1157        """
 1158        sql_column_list = []
 1159        for col in self.get_header_columns_as_list():
 1160            sql_column_list.append(f'"{col}"')
 1161        return ",".join(sql_column_list)
 1162
 1163    def get_header_sample_list(
 1164        self, check: bool = False, samples: list = None, samples_force: bool = False
 1165    ) -> list:
 1166        """
 1167        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1168        checking and filtering based on input parameters.
 1169
 1170        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1171        parameter that determines whether to check if the samples in the list are properly defined as
 1172        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1173        list is defined as a, defaults to False
 1174        :type check: bool (optional)
 1175        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1176        allows you to specify a subset of samples from the header. If you provide a list of sample
 1177        names, the function will check if each sample is defined in the header. If a sample is not found
 1178        in the
 1179        :type samples: list
 1180        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1181        a boolean parameter that determines whether to force the function to return the sample list
 1182        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1183        function will return the sample list without performing, defaults to False
 1184        :type samples_force: bool (optional)
 1185        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1186        parameters and conditions specified in the function.
 1187        """
 1188
 1189        # Init
 1190        samples_list = []
 1191
 1192        if samples is None:
 1193            samples_list = self.header_vcf.samples
 1194        else:
 1195            samples_checked = []
 1196            for sample in samples:
 1197                if sample in self.header_vcf.samples:
 1198                    samples_checked.append(sample)
 1199                else:
 1200                    log.warning(f"Sample '{sample}' not defined in header")
 1201            samples_list = samples_checked
 1202
 1203            # Force sample list without checking if is_genotype_column
 1204            if samples_force:
 1205                log.warning(f"Samples {samples_list} not checked if genotypes")
 1206                return samples_list
 1207
 1208        if check:
 1209            samples_checked = []
 1210            for sample in samples_list:
 1211                if self.is_genotype_column(column=sample):
 1212                    samples_checked.append(sample)
 1213                else:
 1214                    log.warning(
 1215                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1216                    )
 1217            samples_list = samples_checked
 1218
 1219        # Return samples list
 1220        return samples_list
 1221
 1222    def is_genotype_column(self, column: str = None) -> bool:
 1223        """
 1224        This function checks if a given column is a genotype column in a database.
 1225
 1226        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1227        represents the column name in a database table. This method checks if the specified column is a
 1228        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1229        method of
 1230        :type column: str
 1231        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1232        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1233        column name and returns the result. If the `column` parameter is None, it returns False.
 1234        """
 1235
 1236        if column is not None:
 1237            return Database(database=self.get_input()).is_genotype_column(column=column)
 1238        else:
 1239            return False
 1240
 1241    def get_verbose(self) -> bool:
 1242        """
 1243        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1244        exist
 1245
 1246        :return: The value of the key "verbose" in the config dictionary.
 1247        """
 1248        return self.get_config().get("verbose", False)
 1249
 1250    def get_connexion_format(self) -> str:
 1251        """
 1252        It returns the connexion format of the object.
 1253        :return: The connexion_format is being returned.
 1254        """
 1255        connexion_format = self.connexion_format
 1256        if connexion_format not in ["duckdb", "sqlite"]:
 1257            log.error(f"Unknown connexion format {connexion_format}")
 1258            raise ValueError(f"Unknown connexion format {connexion_format}")
 1259        else:
 1260            return connexion_format
 1261
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the current connexion.

        :param file: The file to load (path or open file handle accepted by
            pandas.read_csv)
        :param columns: Comma-separated, quoted column names used in the
            INSERT statement
        :type columns: str
        :param header_len: Number of leading lines to skip before the data
            (e.g. VCF header lines), defaults to 0
        :type header_len: int (optional)
        :param sep: Column delimiter of the file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk; may be overridden by
            the config key load.chunk, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Chunk size from config takes precedence over the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): if chunksize is falsy (0/None) nothing is loaded —
        # confirm this is intentional
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # "FROM chunk" relies on DuckDB's replacement scan, which
                    # resolves the local pandas DataFrame variable `chunk`
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite path goes through pandas' own to_sql writer
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1315
 1316    def load_data(
 1317        self,
 1318        input_file: str = None,
 1319        drop_variants_table: bool = False,
 1320        sample_size: int = 20480,
 1321    ) -> None:
 1322        """
 1323        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1324        table before loading the data and specify a sample size.
 1325
 1326        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1327        table
 1328        :type input_file: str
 1329        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1330        determines whether the variants table should be dropped before loading the data. If set to
 1331        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1332        not be dropped, defaults to False
 1333        :type drop_variants_table: bool (optional)
 1334        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1335        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1336        20480
 1337        :type sample_size: int (optional)
 1338        """
 1339
 1340        log.info("Loading...")
 1341
 1342        # change input file
 1343        if input_file:
 1344            self.set_input(input_file)
 1345            self.set_header()
 1346
 1347        # drop variants table
 1348        if drop_variants_table:
 1349            self.drop_variants_table()
 1350
 1351        # get table variants
 1352        table_variants = self.get_table_variants()
 1353
 1354        # Access
 1355        access = self.get_config().get("access", None)
 1356        log.debug(f"access: {access}")
 1357
 1358        # Input format and compress
 1359        input_format = self.get_input_format()
 1360        input_compressed = self.get_input_compressed()
 1361        log.debug(f"input_format: {input_format}")
 1362        log.debug(f"input_compressed: {input_compressed}")
 1363
 1364        # input_compressed_format
 1365        if input_compressed:
 1366            input_compressed_format = "gzip"
 1367        else:
 1368            input_compressed_format = "none"
 1369        log.debug(f"input_compressed_format: {input_compressed_format}")
 1370
 1371        # Connexion format
 1372        connexion_format = self.get_connexion_format()
 1373
 1374        # Sample size
 1375        if not sample_size:
 1376            sample_size = -1
 1377        log.debug(f"sample_size: {sample_size}")
 1378
 1379        # Load data
 1380        log.debug(f"Load Data from {input_format}")
 1381
 1382        # DuckDB connexion
 1383        if connexion_format in ["duckdb"]:
 1384
 1385            # Database already exists
 1386            if self.input_format in ["db", "duckdb"]:
 1387
 1388                if connexion_format in ["duckdb"]:
 1389                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1390                else:
 1391                    log.error(
 1392                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1393                    )
 1394                    raise ValueError(
 1395                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1396                    )
 1397
 1398            # Load from existing database format
 1399            else:
 1400
 1401                try:
 1402                    # Create Table or View
 1403                    database = Database(database=self.input)
 1404                    sql_from = database.get_sql_from(sample_size=sample_size)
 1405
 1406                    if access in ["RO"]:
 1407                        sql_load = (
 1408                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1409                        )
 1410                    else:
 1411                        sql_load = (
 1412                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1413                        )
 1414                    self.conn.execute(sql_load)
 1415
 1416                except:
 1417                    # Format not available
 1418                    log.error(f"Input file format '{self.input_format}' not available")
 1419                    raise ValueError(
 1420                        f"Input file format '{self.input_format}' not available"
 1421                    )
 1422
 1423        # SQLite connexion
 1424        elif connexion_format in ["sqlite"] and input_format in [
 1425            "vcf",
 1426            "tsv",
 1427            "csv",
 1428            "psv",
 1429        ]:
 1430
 1431            # Main structure
 1432            structure = {
 1433                "#CHROM": "VARCHAR",
 1434                "POS": "INTEGER",
 1435                "ID": "VARCHAR",
 1436                "REF": "VARCHAR",
 1437                "ALT": "VARCHAR",
 1438                "QUAL": "VARCHAR",
 1439                "FILTER": "VARCHAR",
 1440                "INFO": "VARCHAR",
 1441            }
 1442
 1443            # Strcuture with samples
 1444            structure_complete = structure
 1445            if self.get_header_sample_list():
 1446                structure["FORMAT"] = "VARCHAR"
 1447                for sample in self.get_header_sample_list():
 1448                    structure_complete[sample] = "VARCHAR"
 1449
 1450            # Columns list for create and insert
 1451            sql_create_table_columns = []
 1452            sql_create_table_columns_list = []
 1453            for column in structure_complete:
 1454                column_type = structure_complete[column]
 1455                sql_create_table_columns.append(
 1456                    f'"{column}" {column_type} default NULL'
 1457                )
 1458                sql_create_table_columns_list.append(f'"{column}"')
 1459
 1460            # Create database
 1461            log.debug(f"Create Table {table_variants}")
 1462            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1463            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1464            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1465            self.conn.execute(sql_create_table)
 1466
 1467            # chunksize define length of file chunk load file
 1468            chunksize = 100000
 1469
 1470            # delimiter
 1471            delimiter = file_format_delimiters.get(input_format, "\t")
 1472
 1473            # Load the input file
 1474            with open(self.input, "rt") as input_file:
 1475
 1476                # Use the appropriate file handler based on the input format
 1477                if input_compressed:
 1478                    input_file = bgzf.open(self.input, "rt")
 1479                if input_format in ["vcf"]:
 1480                    header_len = self.get_header_length()
 1481                else:
 1482                    header_len = 0
 1483
 1484                # Insert the file contents into a table
 1485                self.insert_file_to_table(
 1486                    input_file,
 1487                    columns=sql_create_table_columns_list_sql,
 1488                    header_len=header_len,
 1489                    sep=delimiter,
 1490                    chunksize=chunksize,
 1491                )
 1492
 1493        else:
 1494            log.error(
 1495                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1496            )
 1497            raise ValueError(
 1498                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1499            )
 1500
 1501        # Explode INFOS fields into table fields
 1502        if self.get_explode_infos():
 1503            self.explode_infos(
 1504                prefix=self.get_explode_infos_prefix(),
 1505                fields=self.get_explode_infos_fields(),
 1506                force=True,
 1507            )
 1508
 1509        # Create index after insertion
 1510        self.create_indexes()
 1511
 1512    def get_explode_infos(self) -> bool:
 1513        """
 1514        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1515        to False if it is not set.
 1516        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1517        value. If the parameter is not present, it will return False.
 1518        """
 1519
 1520        return self.get_param().get("explode", {}).get("explode_infos", False)
 1521
 1522    def get_explode_infos_fields(
 1523        self,
 1524        explode_infos_fields: str = None,
 1525        remove_fields_not_in_header: bool = False,
 1526    ) -> list:
 1527        """
 1528        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1529        the input parameter `explode_infos_fields`.
 1530
 1531        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1532        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1533        comma-separated list of field names to explode
 1534        :type explode_infos_fields: str
 1535        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1536        flag that determines whether to remove fields that are not present in the header. If it is set
 1537        to `True`, any field that is not in the header will be excluded from the list of exploded
 1538        information fields. If it is set to `, defaults to False
 1539        :type remove_fields_not_in_header: bool (optional)
 1540        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1541        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1542        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1543        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1544        splitting the string by commas.
 1545        """
 1546
 1547        # If no fields, get it in param
 1548        if not explode_infos_fields:
 1549            explode_infos_fields = (
 1550                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1551            )
 1552
 1553        # If no fields, defined as all fields in header using keyword
 1554        if not explode_infos_fields:
 1555            explode_infos_fields = "*"
 1556
 1557        # If fields list not empty
 1558        if explode_infos_fields:
 1559
 1560            # Input fields list
 1561            if isinstance(explode_infos_fields, str):
 1562                fields_input = explode_infos_fields.split(",")
 1563            elif isinstance(explode_infos_fields, list):
 1564                fields_input = explode_infos_fields
 1565            else:
 1566                fields_input = []
 1567
 1568            # Fields list without * keyword
 1569            fields_without_all = fields_input.copy()
 1570            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1571                fields_without_all.remove("*")
 1572
 1573            # Fields in header
 1574            fields_in_header = sorted(list(set(self.get_header().infos)))
 1575
 1576            # Construct list of fields
 1577            fields_output = []
 1578            for field in fields_input:
 1579
 1580                # Strip field
 1581                field = field.strip()
 1582
 1583                # format keyword * in regex
 1584                if field.upper() in ["*"]:
 1585                    field = ".*"
 1586
 1587                # Find all fields with pattern
 1588                r = re.compile(field)
 1589                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1590
 1591                # Remove fields input from search
 1592                if field in fields_search:
 1593                    fields_search = [field]
 1594                elif fields_search != [field]:
 1595                    fields_search = sorted(
 1596                        list(set(fields_search).difference(fields_input))
 1597                    )
 1598
 1599                # If field is not in header (avoid not well formatted header)
 1600                if not fields_search and not remove_fields_not_in_header:
 1601                    fields_search = [field]
 1602
 1603                # Add found fields
 1604                for new_field in fields_search:
 1605                    # Add field, if not already exists, and if it is in header (if asked)
 1606                    if (
 1607                        new_field not in fields_output
 1608                        and (
 1609                            not remove_fields_not_in_header
 1610                            or new_field in fields_in_header
 1611                        )
 1612                        and new_field not in [".*"]
 1613                    ):
 1614                        fields_output.append(new_field)
 1615
 1616            return fields_output
 1617
 1618        else:
 1619
 1620            return []
 1621
 1622    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1623        """
 1624        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1625        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1626        not provided.
 1627
 1628        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1629        prefix to be used for exploding or expanding information
 1630        :type explode_infos_prefix: str
 1631        :return: the value of the variable `explode_infos_prefix`.
 1632        """
 1633
 1634        if not explode_infos_prefix:
 1635            explode_infos_prefix = (
 1636                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1637            )
 1638
 1639        return explode_infos_prefix
 1640
 1641    def add_column(
 1642        self,
 1643        table_name,
 1644        column_name,
 1645        column_type,
 1646        default_value=None,
 1647        drop: bool = False,
 1648    ) -> dict:
 1649        """
 1650        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1651        doesn't already exist.
 1652
 1653        :param table_name: The name of the table to which you want to add a column
 1654        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1655        to the table
 1656        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1657        want to add to the table. It should be a string that represents the desired data type, such as
 1658        "INTEGER", "TEXT", "REAL", etc
 1659        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1660        default value for the newly added column. If a default value is provided, it will be assigned to
 1661        the column for any existing rows that do not have a value for that column
 1662        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1663        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1664        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1665        to False
 1666        :type drop: bool (optional)
 1667        :return: a boolean value indicating whether the column was successfully added to the table.
 1668        """
 1669
 1670        # added
 1671        added = False
 1672        dropped = False
 1673
 1674        # Check if the column already exists in the table
 1675        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1676        columns = self.get_query_to_df(query).columns.tolist()
 1677        if column_name.upper() in [c.upper() for c in columns]:
 1678            log.debug(
 1679                f"The {column_name} column already exists in the {table_name} table"
 1680            )
 1681            if drop:
 1682                self.drop_column(table_name=table_name, column_name=column_name)
 1683                dropped = True
 1684            else:
 1685                return None
 1686        else:
 1687            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1688
 1689        # Add column in table
 1690        add_column_query = (
 1691            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1692        )
 1693        if default_value is not None:
 1694            add_column_query += f" DEFAULT {default_value}"
 1695        self.execute_query(add_column_query)
 1696        added = not dropped
 1697        log.debug(
 1698            f"The {column_name} column was successfully added to the {table_name} table"
 1699        )
 1700
 1701        if added:
 1702            added_column = {
 1703                "table_name": table_name,
 1704                "column_name": column_name,
 1705                "column_type": column_type,
 1706                "default_value": default_value,
 1707            }
 1708        else:
 1709            added_column = None
 1710
 1711        return added_column
 1712
 1713    def drop_column(
 1714        self, column: dict = None, table_name: str = None, column_name: str = None
 1715    ) -> bool:
 1716        """
 1717        The `drop_column` function drops a specified column from a given table in a database and returns
 1718        True if the column was successfully dropped, and False if the column does not exist in the
 1719        table.
 1720
 1721        :param column: The `column` parameter is a dictionary that contains information about the column
 1722        you want to drop. It has two keys:
 1723        :type column: dict
 1724        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1725        drop a column
 1726        :type table_name: str
 1727        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1728        from the table
 1729        :type column_name: str
 1730        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1731        and False if the column does not exist in the table.
 1732        """
 1733
 1734        # Find column infos
 1735        if column:
 1736            if isinstance(column, dict):
 1737                table_name = column.get("table_name", None)
 1738                column_name = column.get("column_name", None)
 1739            elif isinstance(column, str):
 1740                table_name = self.get_table_variants()
 1741                column_name = column
 1742            else:
 1743                table_name = None
 1744                column_name = None
 1745
 1746        if not table_name and not column_name:
 1747            return False
 1748
 1749        # Removed
 1750        removed = False
 1751
 1752        # Check if the column already exists in the table
 1753        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1754        columns = self.get_query_to_df(query).columns.tolist()
 1755        if column_name in columns:
 1756            log.debug(f"The {column_name} column exists in the {table_name} table")
 1757        else:
 1758            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1759            return False
 1760
 1761        # Add column in table # ALTER TABLE integers DROP k
 1762        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1763        self.execute_query(add_column_query)
 1764        removed = True
 1765        log.debug(
 1766            f"The {column_name} column was successfully dropped to the {table_name} table"
 1767        )
 1768
 1769        return removed
 1770
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO annotations into dedicated table columns.

        Each selected INFO field becomes a column named ``<prefix><field>`` in
        the variants table (or in `table` when given), populated by parsing the
        raw ``INFO`` string with engine-specific SQL (DuckDB REGEXP_EXTRACT or
        SQLite instr/substr). Indexes are dropped before the updates and
        optionally re-created afterwards. Nothing is modified when the database
        access mode is read-only ("RO").

        :param prefix: prefix for the exploded columns; when None/True or not a
        string, falls back to the configured prefix, then to "INFO/"
        :type prefix: str
        :param create_index: re-create indexes after the update, defaults to False
        :type create_index: bool (optional)
        :param fields: list of INFO fields (or patterns) to explode; resolved
        through `get_explode_infos_fields` (empty means all header fields)
        :type fields: list
        :param force: drop and re-create a column if it already exists, and
        re-run its UPDATE even when no column was newly added, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: run a single UPDATE combining all
        SET clauses instead of one UPDATE per field, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: target table; defaults to the variants table
        :type table: str
        :return: list of dicts describing the columns actually added (as
        returned by `add_column`)
        """

        # drop indexes (they would slow down / interfere with the UPDATEs below)
        self.drop_indexes()

        # connexion format (drives the SQL dialect used to parse INFO)
        connexion_format = self.get_connexion_format()

        # Access mode ("RO" = read-only: no schema or data change allowed)
        access = self.get_config().get("access", None)

        # Descriptions of the columns actually added (returned to the caller)
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to the configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants (explicit target table or default variants table)
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: empty when unavailable)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos (INFO fields declared in the VCF header)
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            # SET clauses accumulated for the UPDATE statement(s) below
            sql_info_alter_table_array = []

            # Known fields: header INFO fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. "*" wildcard or regex)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the exploded column
                info_id_sql = prefix + info

                # Only explode fields that are known (header, request or extra)
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Column type from the header; unknown fields become text
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    # Multi-valued fields (Number != 1) are stored as text
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field (dropped and re-created first when force=True)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the engine-specific SET clause extracting the
                        # field value from the raw INFO string ('' and '.'
                        # are normalized to NULL for duckdb).
                        # NOTE(review): for any connexion format other than
                        # duckdb/sqlite, `update_info_field` would be stale or
                        # unbound when appended below — confirm formats are
                        # restricted upstream.
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process chromosome by chromosome to keep each UPDATE smaller;
                # fall back to a single pass when chromosomes cannot be listed
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (omitted when there is a single pass)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table: either one combined UPDATE, or one per field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes (only on request; includes the fields added above)
        if create_index:
            self.create_indexes()

        return added_columns
 1987
 1988    def create_indexes(self) -> None:
 1989        """
 1990        Create indexes on the table after insertion
 1991        """
 1992
 1993        # Access
 1994        access = self.get_config().get("access", None)
 1995
 1996        # get table variants
 1997        table_variants = self.get_table_variants("FROM")
 1998
 1999        if self.get_indexing() and access not in ["RO"]:
 2000            # Create index
 2001            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2002            self.conn.execute(sql_create_table_index)
 2003            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2004            self.conn.execute(sql_create_table_index)
 2005            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2006            self.conn.execute(sql_create_table_index)
 2007            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2008            self.conn.execute(sql_create_table_index)
 2009            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2010            self.conn.execute(sql_create_table_index)
 2011            for field in self.index_additionnal_fields:
 2012                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2013                self.conn.execute(sql_create_table_index)
 2014
 2015    def drop_indexes(self) -> None:
 2016        """
 2017        Create indexes on the table after insertion
 2018        """
 2019
 2020        # Access
 2021        access = self.get_config().get("access", None)
 2022
 2023        # get table variants
 2024        table_variants = self.get_table_variants("FROM")
 2025
 2026        # Get database format
 2027        connexion_format = self.get_connexion_format()
 2028
 2029        if access not in ["RO"]:
 2030            if connexion_format in ["duckdb"]:
 2031                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2032            elif connexion_format in ["sqlite"]:
 2033                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2034
 2035            list_indexes = self.conn.execute(sql_list_indexes)
 2036            index_names = [row[0] for row in list_indexes.fetchall()]
 2037            for index in index_names:
 2038                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2039                self.conn.execute(sql_drop_table_index)
 2040
 2041    def read_vcf_header(self, f) -> list:
 2042        """
 2043        It reads the header of a VCF file and returns a list of the header lines
 2044
 2045        :param f: the file object
 2046        :return: The header lines of the VCF file.
 2047        """
 2048
 2049        header_list = []
 2050        for line in f:
 2051            header_list.append(line)
 2052            if line.startswith("#CHROM"):
 2053                break
 2054        return header_list
 2055
 2056    def read_vcf_header_file(self, file: str = None) -> list:
 2057        """
 2058        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2059        uncompressed files.
 2060
 2061        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2062        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2063        default to `None`
 2064        :type file: str
 2065        :return: The function `read_vcf_header_file` returns a list.
 2066        """
 2067
 2068        if self.get_input_compressed(input_file=file):
 2069            with bgzf.open(file, "rt") as f:
 2070                return self.read_vcf_header(f=f)
 2071        else:
 2072            with open(file, "rt") as f:
 2073                return self.read_vcf_header(f=f)
 2074
 2075    def execute_query(self, query: str):
 2076        """
 2077        It takes a query as an argument, executes it, and returns the results
 2078
 2079        :param query: The query to be executed
 2080        :return: The result of the query is being returned.
 2081        """
 2082        if query:
 2083            return self.conn.execute(query)  # .fetchall()
 2084        else:
 2085            return None
 2086
 2087    def export_output(
 2088        self,
 2089        output_file: str | None = None,
 2090        output_header: str | None = None,
 2091        export_header: bool = True,
 2092        query: str | None = None,
 2093        parquet_partitions: list | None = None,
 2094        chunk_size: int | None = None,
 2095        threads: int | None = None,
 2096        sort: bool = False,
 2097        index: bool = False,
 2098        order_by: str | None = None,
 2099    ) -> bool:
 2100        """
 2101        The `export_output` function exports data from a VCF file to a specified output file in various
 2102        formats, including VCF, CSV, TSV, PSV, and Parquet.
 2103
 2104        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2105        output file to be generated by the function. This is where the exported data will be saved
 2106        :type output_file: str
 2107        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2108        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2109        header will be exported to a file with the same name as the `output_file` parameter, but with
 2110        the extension "
 2111        :type output_header: str
 2112        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2113        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2114        True, the header will be exported to a file. If `export_header` is False, the header will not
 2115        be, defaults to True, if output format is not VCF
 2116        :type export_header: bool (optional)
 2117        :param query: The `query` parameter is an optional SQL query that can be used to filter and
 2118        select specific data from the VCF file before exporting it. If provided, only the data that
 2119        matches the query will be exported
 2120        :type query: str
 2121        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2122        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2123        organize data in a hierarchical directory structure based on the values of one or more columns.
 2124        This can improve query performance when working with large datasets
 2125        :type parquet_partitions: list
 2126        :param chunk_size: The `chunk_size` parameter specifies the number of
 2127        records in batch when exporting data in Parquet format. This parameter is used for
 2128        partitioning the Parquet file into multiple files.
 2129        :type chunk_size: int
 2130        :param threads: The `threads` parameter is an optional parameter that specifies the number of
 2131        threads to be used during the export process. It determines the level of parallelism and can
 2132        improve the performance of the export operation. If not provided, the function will use the
 2133        default number of threads
 2134        :type threads: int
 2135        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
 2136        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
 2137        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
 2138        False
 2139        :type sort: bool (optional)
 2140        :param index: The `index` parameter is a boolean flag that determines whether an index should be
 2141        created on the output file. If `index` is True, an index will be created. If `index` is False,
 2142        no index will be created. The default value is False, defaults to False
 2143        :type index: bool (optional)
 2144        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
 2145        sorting the output file. This parameter is only applicable when exporting data in VCF format
 2146        :type order_by: str
 2147        :return: a boolean value. It checks if the output file exists and returns True if it does, or
 2148        None if it doesn't.
 2149        """
 2150
 2151        # Log
 2152        log.info("Exporting...")
 2153
 2154        # Full path
 2155        output_file = full_path(output_file)
 2156        output_header = full_path(output_header)
 2157
 2158        # Config
 2159        config = self.get_config()
 2160
 2161        # Param
 2162        param = self.get_param()
 2163
 2164        # Tmp files to remove
 2165        tmp_to_remove = []
 2166
 2167        # If no output, get it
 2168        if not output_file:
 2169            output_file = self.get_output()
 2170
 2171        # If not threads
 2172        if not threads:
 2173            threads = self.get_threads()
 2174
 2175        # Auto header name with extension
 2176        if export_header or output_header:
 2177            if not output_header:
 2178                output_header = f"{output_file}.hdr"
 2179            # Export header
 2180            self.export_header(output_file=output_file)
 2181
 2182        # Switch off export header if VCF output
 2183        output_file_type = get_file_format(output_file)
 2184        if output_file_type in ["vcf"]:
 2185            export_header = False
 2186            tmp_to_remove.append(output_header)
 2187
 2188        # Chunk size
 2189        if not chunk_size:
 2190            chunk_size = config.get("chunk_size", None)
 2191
 2192        # Parquet partition
 2193        if not parquet_partitions:
 2194            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2195        if parquet_partitions and isinstance(parquet_partitions, str):
 2196            parquet_partitions = parquet_partitions.split(",")
 2197
 2198        # Order by
 2199        if not order_by:
 2200            order_by = param.get("export", {}).get("order_by", "")
 2201
 2202        # Header in output
 2203        header_in_output = param.get("export", {}).get("include_header", False)
 2204
 2205        # Database
 2206        database_source = self.get_connexion()
 2207
 2208        # Connexion format
 2209        connexion_format = self.get_connexion_format()
 2210
 2211        # Explode infos
 2212        if self.get_explode_infos():
 2213            self.explode_infos(
 2214                prefix=self.get_explode_infos_prefix(),
 2215                fields=self.get_explode_infos_fields(),
 2216                force=False,
 2217            )
 2218
 2219        # if connexion_format in ["sqlite"] or query:
 2220        if connexion_format in ["sqlite"]:
 2221
 2222            # Export in Parquet
 2223            random_tmp = "".join(
 2224                random.choice(string.ascii_lowercase) for i in range(10)
 2225            )
 2226            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2227            tmp_to_remove.append(database_source)
 2228
 2229            # Table Variants
 2230            table_variants = self.get_table_variants()
 2231
 2232            # Create export query
 2233            sql_query_export_subquery = f"""
 2234                SELECT * FROM {table_variants}
 2235                """
 2236
 2237            # Write source file
 2238            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2239
 2240        # Create database
 2241        database = Database(
 2242            database=database_source,
 2243            table="variants",
 2244            header_file=output_header,
 2245            conn_config=self.get_connexion_config(),
 2246        )
 2247
 2248        # Existing colomns header
 2249        existing_columns_header = database.get_header_columns_from_database(query=query)
 2250
 2251        # Sample list
 2252        if output_file_type in ["vcf"]:
 2253            get_samples = self.get_samples()
 2254            get_samples_check = self.get_samples_check()
 2255            samples_force = get_samples is not None
 2256            sample_list = self.get_header_sample_list(
 2257                check=get_samples_check,
 2258                samples=get_samples,
 2259                samples_force=samples_force,
 2260            )
 2261        else:
 2262            sample_list = None
 2263
 2264        # Export file
 2265        database.export(
 2266            output_database=output_file,
 2267            output_header=output_header,
 2268            existing_columns_header=existing_columns_header,
 2269            parquet_partitions=parquet_partitions,
 2270            chunk_size=chunk_size,
 2271            threads=threads,
 2272            sort=sort,
 2273            index=index,
 2274            header_in_output=header_in_output,
 2275            order_by=order_by,
 2276            query=query,
 2277            export_header=export_header,
 2278            sample_list=sample_list,
 2279        )
 2280
 2281        # Remove
 2282        remove_if_exists(tmp_to_remove)
 2283
 2284        return (os.path.exists(output_file) or None) and (
 2285            os.path.exists(output_file) or None
 2286        )
 2287
 2288    def get_extra_infos(self, table: str = None) -> list:
 2289        """
 2290        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2291        in the header.
 2292
 2293        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2294        name of the table from which you want to retrieve the extra columns that are not present in the
 2295        header. If the `table` parameter is not provided when calling the function, it will default to
 2296        using the variants
 2297        :type table: str
 2298        :return: A list of columns that are in the specified table but not in the header of the table.
 2299        """
 2300
 2301        header_columns = []
 2302
 2303        if not table:
 2304            table = self.get_table_variants(clause="from")
 2305            header_columns = self.get_header_columns()
 2306
 2307        # Check all columns in the database
 2308        query = f""" SELECT * FROM {table} LIMIT 1 """
 2309        log.debug(f"query {query}")
 2310        table_columns = self.get_query_to_df(query).columns.tolist()
 2311        extra_columns = []
 2312
 2313        # Construct extra infos (not in header)
 2314        for column in table_columns:
 2315            if column not in header_columns:
 2316                extra_columns.append(column)
 2317
 2318        return extra_columns
 2319
 2320    def get_extra_infos_sql(self, table: str = None) -> str:
 2321        """
 2322        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2323        by double quotes
 2324
 2325        :param table: The name of the table to get the extra infos from. If None, the default table is
 2326        used
 2327        :type table: str
 2328        :return: A string of the extra infos
 2329        """
 2330
 2331        return ", ".join(
 2332            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2333        )
 2334
 2335    def export_header(
 2336        self,
 2337        header_name: str = None,
 2338        output_file: str = None,
 2339        output_file_ext: str = ".hdr",
 2340        clean_header: bool = True,
 2341        remove_chrom_line: bool = False,
 2342    ) -> str:
 2343        """
 2344        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2345        specified options, and writes it to a new file.
 2346
 2347        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2348        this parameter is not specified, the header will be written to the output file
 2349        :type header_name: str
 2350        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2351        specify the name of the output file where the header will be written. If this parameter is not
 2352        provided, the header will be written to a temporary file
 2353        :type output_file: str
 2354        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2355        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2356        if not specified by the user. This extension will be appended to the `output_file` name to
 2357        create the final, defaults to .hdr
 2358        :type output_file_ext: str (optional)
 2359        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2360        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2361        `True`, the function will clean the header by modifying certain lines based on a specific
 2362        pattern. If `clean_header`, defaults to True
 2363        :type clean_header: bool (optional)
 2364        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2365        boolean flag that determines whether the #CHROM line should be removed from the header before
 2366        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2367        defaults to False
 2368        :type remove_chrom_line: bool (optional)
 2369        :return: The function `export_header` returns the name of the temporary header file that is
 2370        created.
 2371        """
 2372
 2373        if not header_name and not output_file:
 2374            output_file = self.get_output()
 2375
 2376        if self.get_header():
 2377
 2378            # Get header object
 2379            header_obj = self.get_header()
 2380
 2381            # Create database
 2382            db_for_header = Database(database=self.get_input())
 2383
 2384            # Get real columns in the file
 2385            db_header_columns = db_for_header.get_columns()
 2386
 2387            with tempfile.TemporaryDirectory() as tmpdir:
 2388
 2389                # Write header file
 2390                header_file_tmp = os.path.join(tmpdir, "header")
 2391                f = open(header_file_tmp, "w")
 2392                vcf.Writer(f, header_obj)
 2393                f.close()
 2394
 2395                # Replace #CHROM line with rel columns
 2396                header_list = db_for_header.read_header_file(
 2397                    header_file=header_file_tmp
 2398                )
 2399                header_list[-1] = "\t".join(db_header_columns)
 2400
 2401                # Remove CHROM line
 2402                if remove_chrom_line:
 2403                    header_list.pop()
 2404
 2405                # Clean header
 2406                if clean_header:
 2407                    header_list_clean = []
 2408                    for head in header_list:
 2409                        # Clean head for malformed header
 2410                        head_clean = head
 2411                        head_clean = re.subn(
 2412                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2413                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2414                            head_clean,
 2415                            2,
 2416                        )[0]
 2417                        # Write header
 2418                        header_list_clean.append(head_clean)
 2419                    header_list = header_list_clean
 2420
 2421            tmp_header_name = output_file + output_file_ext
 2422
 2423            f = open(tmp_header_name, "w")
 2424            for line in header_list:
 2425                f.write(line)
 2426            f.close()
 2427
 2428        return tmp_header_name
 2429
 2430    def export_variant_vcf(
 2431        self,
 2432        vcf_file,
 2433        remove_info: bool = False,
 2434        add_samples: bool = True,
 2435        list_samples: list = [],
 2436        where_clause: str = "",
 2437        index: bool = False,
 2438        threads: int | None = None,
 2439    ) -> bool | None:
 2440        """
 2441        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2442        remove INFO field, add samples, and control compression and indexing.
 2443
 2444        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2445        written to. It is the output file that will contain the filtered VCF data based on the specified
 2446        parameters
 2447        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2448        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2449        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2450        in, defaults to False
 2451        :type remove_info: bool (optional)
 2452        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2453        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2454        If set to False, the samples will be removed. The default value is True, defaults to True
 2455        :type add_samples: bool (optional)
 2456        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2457        in the output VCF file. By default, all samples will be included. If you provide a list of
 2458        samples, only those samples will be included in the output file
 2459        :type list_samples: list
 2460        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2461        determines whether or not to create an index for the output VCF file. If `index` is set to
 2462        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2463        :type index: bool (optional)
 2464        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2465        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2466        will be used during the export process. More threads can potentially speed up the export process
 2467        by utilizing multiple cores of the processor. If
 2468        :type threads: int | None
 2469        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2470        method with various parameters including the output file, query, threads, sort flag, and index
 2471        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2472        specified parameters and configurations provided in the `export_variant_vcf` function.
 2473        """
 2474
 2475        # Config
 2476        config = self.get_config()
 2477
 2478        # Extract VCF
 2479        log.debug("Export VCF...")
 2480
 2481        # Table variants
 2482        table_variants = self.get_table_variants()
 2483
 2484        # Threads
 2485        if not threads:
 2486            threads = self.get_threads()
 2487
 2488        # Info fields
 2489        if remove_info:
 2490            if not isinstance(remove_info, str):
 2491                remove_info = "."
 2492            info_field = f"""'{remove_info}' as INFO"""
 2493        else:
 2494            info_field = "INFO"
 2495
 2496        # Samples fields
 2497        if add_samples:
 2498            if not list_samples:
 2499                list_samples = self.get_header_sample_list()
 2500            if list_samples:
 2501                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2502            else:
 2503                samples_fields = ""
 2504            log.debug(f"samples_fields: {samples_fields}")
 2505        else:
 2506            samples_fields = ""
 2507
 2508        # Where clause
 2509        if where_clause is None:
 2510            where_clause = ""
 2511
 2512        # Variants
 2513        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2514        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2515        log.debug(f"sql_query_select={sql_query_select}")
 2516
 2517        return self.export_output(
 2518            output_file=vcf_file,
 2519            output_header=None,
 2520            export_header=True,
 2521            query=sql_query_select,
 2522            parquet_partitions=None,
 2523            chunk_size=config.get("chunk_size", None),
 2524            threads=threads,
 2525            sort=True,
 2526            index=index,
 2527            order_by=None,
 2528        )
 2529
 2530    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2531        """
 2532        It takes a list of commands and runs them in parallel using the number of threads specified
 2533
 2534        :param commands: A list of commands to run
 2535        :param threads: The number of threads to use, defaults to 1 (optional)
 2536        """
 2537
 2538        run_parallel_commands(commands, threads)
 2539
 2540    def get_threads(self, default: int = 1) -> int:
 2541        """
 2542        This function returns the number of threads to use for a job, with a default value of 1 if not
 2543        specified.
 2544
 2545        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2546        default number of threads to use if no specific value is provided. If no value is provided for
 2547        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2548        used, defaults to 1
 2549        :type default: int (optional)
 2550        :return: the number of threads to use for the current job.
 2551        """
 2552
 2553        # Config
 2554        config = self.get_config()
 2555
 2556        # Param
 2557        param = self.get_param()
 2558
 2559        # Input threads
 2560        input_thread = param.get("threads", config.get("threads", None))
 2561
 2562        # Check threads
 2563        if not input_thread:
 2564            threads = default
 2565        elif int(input_thread) <= 0:
 2566            threads = os.cpu_count()
 2567        else:
 2568            threads = int(input_thread)
 2569        return threads
 2570
 2571    def get_memory(self, default: str = None) -> str:
 2572        """
 2573        This function retrieves the memory value from parameters or configuration with a default value
 2574        if not found.
 2575
 2576        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2577        default value is used as a fallback in case the `memory` parameter is not provided in the
 2578        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2579        the function
 2580        :type default: str
 2581        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2582        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2583        return the default value provided as an argument to the function.
 2584        """
 2585
 2586        # Config
 2587        config = self.get_config()
 2588
 2589        # Param
 2590        param = self.get_param()
 2591
 2592        # Input threads
 2593        input_memory = param.get("memory", config.get("memory", None))
 2594
 2595        # Check threads
 2596        if input_memory:
 2597            memory = input_memory
 2598        else:
 2599            memory = default
 2600
 2601        return memory
 2602
 2603    def update_from_vcf(self, vcf_file: str) -> None:
 2604        """
 2605        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2606
 2607        :param vcf_file: the path to the VCF file
 2608        """
 2609
 2610        connexion_format = self.get_connexion_format()
 2611
 2612        if connexion_format in ["duckdb"]:
 2613            self.update_from_vcf_duckdb(vcf_file)
 2614        elif connexion_format in ["sqlite"]:
 2615            self.update_from_vcf_sqlite(vcf_file)
 2616
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (duckdb).

        Rows are matched on #CHROM/POS/REF/ALT; the incoming INFO is appended
        to the existing INFO with a ';' separator when both are non-empty
        (empty meaning '' or '.').

        :param vcf_file: the path to the VCF file
        """

        # Variants table name
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines.
        # NOTE(review): vcf_df looks unused in Python, but the SQL below reads
        # it as 'FROM vcf_df' — presumably via duckdb's replacement scan over
        # in-scope pandas DataFrames; do not remove it.
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the matching row's INFO (with a ';' separator only when both
        # sides carry content) to the existing INFO.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2672
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (sqlite).

        Creates a temporary table, loads the VCF body into it, updates the
        INFO column of the variants table (rows matched on #CHROM/POS/REF/ALT;
        the incoming INFO is appended with a ';' separator when both sides are
        non-empty), then drops the temporary table.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table for the VCF, with the same schema as
        # 'variants' (WHERE 0 copies the structure without rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body ('#' comment lines skipped) into the temporary table
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator (sqlite has no concat() function)
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
 2730
 2731    def drop_variants_table(self) -> None:
 2732        """
 2733        > This function drops the variants table
 2734        """
 2735
 2736        table_variants = self.get_table_variants()
 2737        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2738        self.conn.execute(sql_table_variants)
 2739
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant-id column to the variants table, populated with a hash
        of the assembly, `#CHROM`, `POS`, `REF`, `ALT` and an SVTYPE tag.

        :param variant_id_column: The name of the column to be created in the
        variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the column is (re)populated even if it already
        exists
        :type force: bool
        :return: The name of the column that contains the variant id
        """

        # Assembly: param > config > project default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE so it is available as a column; the columns
        # added here are dropped again at the end of this method
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Normalize empty column name to the default
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check uses the literal "variant_id"
        # rather than variant_id_column — with a custom column name the check
        # may not match the column being created; confirm intended behavior
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): '"{prefix}SVTYPE"' is single-quoted inside the SQL,
            # so hash() receives the literal string '"<prefix>SVTYPE"', not the
            # value of that column — confirm whether the column was intended
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove columns added by explode_infos above
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
 2798
 2799    def get_variant_id_column(
 2800        self, variant_id_column: str = "variant_id", force: bool = None
 2801    ) -> str:
 2802        """
 2803        This function returns the variant_id column name
 2804
 2805        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2806        defaults to variant_id
 2807        :type variant_id_column: str (optional)
 2808        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2809        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2810        if it is not already set, or if it is set
 2811        :type force: bool
 2812        :return: The variant_id column name.
 2813        """
 2814
 2815        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2816
 2817    ###
 2818    # Annotation
 2819    ###
 2820
 2821    def scan_databases(
 2822        self,
 2823        database_formats: list = ["parquet"],
 2824        database_releases: list = ["current"],
 2825    ) -> dict:
 2826        """
 2827        The function `scan_databases` scans for available databases based on specified formats and
 2828        releases.
 2829
 2830        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2831        of the databases to be scanned. In this case, the accepted format is "parquet"
 2832        :type database_formats: list ["parquet"]
 2833        :param database_releases: The `database_releases` parameter is a list that specifies the
 2834        releases of the databases to be scanned. In the provided function, the default value for
 2835        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2836        databases that are in the "current"
 2837        :type database_releases: list
 2838        :return: The function `scan_databases` returns a dictionary containing information about
 2839        databases that match the specified formats and releases.
 2840        """
 2841
 2842        # Config
 2843        config = self.get_config()
 2844
 2845        # Param
 2846        param = self.get_param()
 2847
 2848        # Param - Assembly
 2849        assembly = param.get("assembly", config.get("assembly", None))
 2850        if not assembly:
 2851            assembly = DEFAULT_ASSEMBLY
 2852            log.warning(f"Default assembly '{assembly}'")
 2853
 2854        # Scan for availabled databases
 2855        log.info(
 2856            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2857        )
 2858        databases_infos_dict = databases_infos(
 2859            database_folder_releases=database_releases,
 2860            database_formats=database_formats,
 2861            assembly=assembly,
 2862            config=config,
 2863        )
 2864        log.info(
 2865            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2866        )
 2867
 2868        return databases_infos_dict
 2869
 2870    def annotation(self) -> None:
 2871        """
 2872        It annotates the VCF file with the annotations specified in the config file.
 2873        """
 2874
 2875        # Config
 2876        config = self.get_config()
 2877
 2878        # Param
 2879        param = self.get_param()
 2880
 2881        # Param - Assembly
 2882        assembly = param.get("assembly", config.get("assembly", None))
 2883        if not assembly:
 2884            assembly = DEFAULT_ASSEMBLY
 2885            log.warning(f"Default assembly '{assembly}'")
 2886
 2887        # annotations databases folders
 2888        annotations_databases = set(
 2889            config.get("folders", {})
 2890            .get("databases", {})
 2891            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2892            + config.get("folders", {})
 2893            .get("databases", {})
 2894            .get("parquet", ["~/howard/databases/parquet/current"])
 2895            + config.get("folders", {})
 2896            .get("databases", {})
 2897            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2898        )
 2899
 2900        # Get param annotations
 2901        if param.get("annotations", None) and isinstance(
 2902            param.get("annotations", None), str
 2903        ):
 2904            log.debug(param.get("annotations", None))
 2905            param_annotation_list = param.get("annotations").split(",")
 2906        else:
 2907            param_annotation_list = []
 2908
 2909        # Each tools param
 2910        if param.get("annotation_parquet", None) != None:
 2911            log.debug(
 2912                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2913            )
 2914            if isinstance(param.get("annotation_parquet", None), list):
 2915                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2916            else:
 2917                param_annotation_list.append(param.get("annotation_parquet"))
 2918        if param.get("annotation_snpsift", None) != None:
 2919            if isinstance(param.get("annotation_snpsift", None), list):
 2920                param_annotation_list.append(
 2921                    "snpsift:"
 2922                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2923                )
 2924            else:
 2925                param_annotation_list.append(
 2926                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2927                )
 2928        if param.get("annotation_snpeff", None) != None:
 2929            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2930        if param.get("annotation_bcftools", None) != None:
 2931            if isinstance(param.get("annotation_bcftools", None), list):
 2932                param_annotation_list.append(
 2933                    "bcftools:"
 2934                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2935                )
 2936            else:
 2937                param_annotation_list.append(
 2938                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2939                )
 2940        if param.get("annotation_annovar", None) != None:
 2941            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2942        if param.get("annotation_exomiser", None) != None:
 2943            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2944        if param.get("annotation_splice", None) != None:
 2945            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2946
 2947        # Merge param annotations list
 2948        param["annotations"] = ",".join(param_annotation_list)
 2949
 2950        # debug
 2951        log.debug(f"param_annotations={param['annotations']}")
 2952
 2953        if param.get("annotations"):
 2954
 2955            # Log
 2956            # log.info("Annotations - Check annotation parameters")
 2957
 2958            if not "annotation" in param:
 2959                param["annotation"] = {}
 2960
 2961            # List of annotations parameters
 2962            annotations_list_input = {}
 2963            if isinstance(param.get("annotations", None), str):
 2964                annotation_file_list = [
 2965                    value for value in param.get("annotations", "").split(",")
 2966                ]
 2967                for annotation_file in annotation_file_list:
 2968                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
 2969            else:
 2970                annotations_list_input = param.get("annotations", {})
 2971
 2972            log.info(f"Quick Annotations:")
 2973            for annotation_key in list(annotations_list_input.keys()):
 2974                log.info(f"   {annotation_key}")
 2975
 2976            # List of annotations and associated fields
 2977            annotations_list = {}
 2978
 2979            for annotation_file in annotations_list_input:
 2980
 2981                # Explode annotations if ALL
 2982                if (
 2983                    annotation_file.upper() == "ALL"
 2984                    or annotation_file.upper().startswith("ALL:")
 2985                ):
 2986
 2987                    # check ALL parameters (formats, releases)
 2988                    annotation_file_split = annotation_file.split(":")
 2989                    database_formats = "parquet"
 2990                    database_releases = "current"
 2991                    for annotation_file_option in annotation_file_split[1:]:
 2992                        database_all_options_split = annotation_file_option.split("=")
 2993                        if database_all_options_split[0] == "format":
 2994                            database_formats = database_all_options_split[1].split("+")
 2995                        if database_all_options_split[0] == "release":
 2996                            database_releases = database_all_options_split[1].split("+")
 2997
 2998                    # Scan for availabled databases
 2999                    databases_infos_dict = self.scan_databases(
 3000                        database_formats=database_formats,
 3001                        database_releases=database_releases,
 3002                    )
 3003
 3004                    # Add found databases in annotation parameters
 3005                    for database_infos in databases_infos_dict.keys():
 3006                        annotations_list[database_infos] = {"INFO": None}
 3007
 3008                else:
 3009                    annotations_list[annotation_file] = annotations_list_input[
 3010                        annotation_file
 3011                    ]
 3012
 3013            # Check each databases
 3014            if len(annotations_list):
 3015
 3016                log.info(
 3017                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 3018                )
 3019
 3020                for annotation_file in annotations_list:
 3021
 3022                    # Init
 3023                    annotations = annotations_list.get(annotation_file, None)
 3024
 3025                    # Annotation snpEff
 3026                    if annotation_file.startswith("snpeff"):
 3027
 3028                        log.debug(f"Quick Annotation snpEff")
 3029
 3030                        if "snpeff" not in param["annotation"]:
 3031                            param["annotation"]["snpeff"] = {}
 3032
 3033                        if "options" not in param["annotation"]["snpeff"]:
 3034                            param["annotation"]["snpeff"]["options"] = ""
 3035
 3036                        # snpEff options in annotations
 3037                        param["annotation"]["snpeff"]["options"] = "".join(
 3038                            annotation_file.split(":")[1:]
 3039                        )
 3040
 3041                    # Annotation Annovar
 3042                    elif annotation_file.startswith("annovar"):
 3043
 3044                        log.debug(f"Quick Annotation Annovar")
 3045
 3046                        if "annovar" not in param["annotation"]:
 3047                            param["annotation"]["annovar"] = {}
 3048
 3049                        if "annotations" not in param["annotation"]["annovar"]:
 3050                            param["annotation"]["annovar"]["annotations"] = {}
 3051
 3052                        # Options
 3053                        annotation_file_split = annotation_file.split(":")
 3054                        for annotation_file_annotation in annotation_file_split[1:]:
 3055                            if annotation_file_annotation:
 3056                                param["annotation"]["annovar"]["annotations"][
 3057                                    annotation_file_annotation
 3058                                ] = annotations
 3059
 3060                    # Annotation Exomiser
 3061                    elif annotation_file.startswith("exomiser"):
 3062
 3063                        log.debug(f"Quick Annotation Exomiser")
 3064
 3065                        param["annotation"]["exomiser"] = params_string_to_dict(
 3066                            annotation_file
 3067                        )
 3068
 3069                    # Annotation Splice
 3070                    elif annotation_file.startswith("splice"):
 3071
 3072                        log.debug(f"Quick Annotation Splice")
 3073
 3074                        param["annotation"]["splice"] = params_string_to_dict(
 3075                            annotation_file
 3076                        )
 3077
 3078                    # Annotation Parquet or BCFTOOLS
 3079                    else:
 3080
 3081                        # Tools detection
 3082                        if annotation_file.startswith("bcftools:"):
 3083                            annotation_tool_initial = "bcftools"
 3084                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3085                        elif annotation_file.startswith("snpsift:"):
 3086                            annotation_tool_initial = "snpsift"
 3087                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3088                        elif annotation_file.startswith("bigwig:"):
 3089                            annotation_tool_initial = "bigwig"
 3090                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3091                        else:
 3092                            annotation_tool_initial = None
 3093
 3094                        # list of files
 3095                        annotation_file_list = annotation_file.replace("+", ":").split(
 3096                            ":"
 3097                        )
 3098
 3099                        for annotation_file in annotation_file_list:
 3100
 3101                            if annotation_file:
 3102
 3103                                # Annotation tool initial
 3104                                annotation_tool = annotation_tool_initial
 3105
 3106                                # Find file
 3107                                annotation_file_found = None
 3108
 3109                                if os.path.exists(annotation_file):
 3110                                    annotation_file_found = annotation_file
 3111                                elif os.path.exists(full_path(annotation_file)):
 3112                                    annotation_file_found = full_path(annotation_file)
 3113                                else:
 3114                                    # Find within assembly folders
 3115                                    for annotations_database in annotations_databases:
 3116                                        found_files = find_all(
 3117                                            annotation_file,
 3118                                            os.path.join(
 3119                                                annotations_database, assembly
 3120                                            ),
 3121                                        )
 3122                                        if len(found_files) > 0:
 3123                                            annotation_file_found = found_files[0]
 3124                                            break
 3125                                    if not annotation_file_found and not assembly:
 3126                                        # Find within folders
 3127                                        for (
 3128                                            annotations_database
 3129                                        ) in annotations_databases:
 3130                                            found_files = find_all(
 3131                                                annotation_file, annotations_database
 3132                                            )
 3133                                            if len(found_files) > 0:
 3134                                                annotation_file_found = found_files[0]
 3135                                                break
 3136                                log.debug(
 3137                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 3138                                )
 3139
 3140                                # Full path
 3141                                annotation_file_found = full_path(annotation_file_found)
 3142
 3143                                if annotation_file_found:
 3144
 3145                                    database = Database(database=annotation_file_found)
 3146                                    quick_annotation_format = database.get_format()
 3147                                    quick_annotation_is_compressed = (
 3148                                        database.is_compressed()
 3149                                    )
 3150                                    quick_annotation_is_indexed = os.path.exists(
 3151                                        f"{annotation_file_found}.tbi"
 3152                                    )
 3153                                    bcftools_preference = False
 3154
 3155                                    # Check Annotation Tool
 3156                                    if not annotation_tool:
 3157                                        if (
 3158                                            bcftools_preference
 3159                                            and quick_annotation_format
 3160                                            in ["vcf", "bed"]
 3161                                            and quick_annotation_is_compressed
 3162                                            and quick_annotation_is_indexed
 3163                                        ):
 3164                                            annotation_tool = "bcftools"
 3165                                        elif quick_annotation_format in [
 3166                                            "vcf",
 3167                                            "bed",
 3168                                            "tsv",
 3169                                            "tsv",
 3170                                            "csv",
 3171                                            "json",
 3172                                            "tbl",
 3173                                            "parquet",
 3174                                            "duckdb",
 3175                                        ]:
 3176                                            annotation_tool = "parquet"
 3177                                        elif quick_annotation_format in ["bw"]:
 3178                                            annotation_tool = "bigwig"
 3179                                        else:
 3180                                            log.error(
 3181                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3182                                            )
 3183                                            raise ValueError(
 3184                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3185                                            )
 3186
 3187                                    log.debug(
 3188                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3189                                    )
 3190
 3191                                    # Annotation Tool dispatch
 3192                                    if annotation_tool:
 3193                                        if annotation_tool not in param["annotation"]:
 3194                                            param["annotation"][annotation_tool] = {}
 3195                                        if (
 3196                                            "annotations"
 3197                                            not in param["annotation"][annotation_tool]
 3198                                        ):
 3199                                            param["annotation"][annotation_tool][
 3200                                                "annotations"
 3201                                            ] = {}
 3202                                        param["annotation"][annotation_tool][
 3203                                            "annotations"
 3204                                        ][annotation_file_found] = annotations
 3205
 3206                                else:
 3207                                    log.warning(
 3208                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3209                                    )
 3210
 3211                self.set_param(param)
 3212
 3213        if param.get("annotation", None):
 3214            log.info("Annotations")
 3215            if param.get("annotation", {}).get("parquet", None):
 3216                log.info("Annotations 'parquet'...")
 3217                self.annotation_parquet()
 3218            if param.get("annotation", {}).get("bcftools", None):
 3219                log.info("Annotations 'bcftools'...")
 3220                self.annotation_bcftools()
 3221            if param.get("annotation", {}).get("snpsift", None):
 3222                log.info("Annotations 'snpsift'...")
 3223                self.annotation_snpsift()
 3224            if param.get("annotation", {}).get("bigwig", None):
 3225                log.info("Annotations 'bigwig'...")
 3226                self.annotation_bigwig()
 3227            if param.get("annotation", {}).get("annovar", None):
 3228                log.info("Annotations 'annovar'...")
 3229                self.annotation_annovar()
 3230            if param.get("annotation", {}).get("snpeff", None):
 3231                log.info("Annotations 'snpeff'...")
 3232                self.annotation_snpeff()
 3233            if param.get("annotation", {}).get("exomiser", None) is not None:
 3234                log.info("Annotations 'exomiser'...")
 3235                self.annotation_exomiser()
 3236            if param.get("annotation", {}).get("splice", None) is not None:
 3237                log.info("Annotations 'splice' ...")
 3238                self.annotation_splice()
 3239
 3240        # Explode INFOS fields into table fields
 3241        if self.get_explode_infos():
 3242            self.explode_infos(
 3243                prefix=self.get_explode_infos_prefix(),
 3244                fields=self.get_explode_infos_fields(),
 3245                force=True,
 3246            )
 3247
 3248    def annotation_bigwig(self, threads: int = None) -> None:
 3249        """
 3250        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3251
 3252        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3253        number of threads to be used for parallel processing during the annotation process. If the
 3254        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3255        threads to use based on the system configuration
 3256        :type threads: int
 3257        :return: True
 3258        """
 3259
 3260        # DEBUG
 3261        log.debug("Start annotation with bigwig databases")
 3262
 3263        # # Threads
 3264        # if not threads:
 3265        #     threads = self.get_threads()
 3266        # log.debug("Threads: " + str(threads))
 3267
 3268        # Config
 3269        config = self.get_config()
 3270        log.debug("Config: " + str(config))
 3271
 3272        # Config - BCFTools databases folders
 3273        databases_folders = set(
 3274            self.get_config()
 3275            .get("folders", {})
 3276            .get("databases", {})
 3277            .get("annotations", ["."])
 3278            + self.get_config()
 3279            .get("folders", {})
 3280            .get("databases", {})
 3281            .get("bigwig", ["."])
 3282        )
 3283        log.debug("Databases annotations: " + str(databases_folders))
 3284
 3285        # Param
 3286        annotations = (
 3287            self.get_param()
 3288            .get("annotation", {})
 3289            .get("bigwig", {})
 3290            .get("annotations", None)
 3291        )
 3292        log.debug("Annotations: " + str(annotations))
 3293
 3294        # Assembly
 3295        assembly = self.get_param().get(
 3296            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3297        )
 3298
 3299        # Data
 3300        table_variants = self.get_table_variants()
 3301
 3302        # Check if not empty
 3303        log.debug("Check if not empty")
 3304        sql_query_chromosomes = (
 3305            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3306        )
 3307        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3308        if not sql_query_chromosomes_df["count"][0]:
 3309            log.info(f"VCF empty")
 3310            return
 3311
 3312        # VCF header
 3313        vcf_reader = self.get_header()
 3314        log.debug("Initial header: " + str(vcf_reader.infos))
 3315
 3316        # Existing annotations
 3317        for vcf_annotation in self.get_header().infos:
 3318
 3319            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3320            log.debug(
 3321                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3322            )
 3323
 3324        if annotations:
 3325
 3326            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3327
 3328                # Export VCF file
 3329                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3330
 3331                # annotation_bigwig_config
 3332                annotation_bigwig_config_list = []
 3333
 3334                for annotation in annotations:
 3335                    annotation_fields = annotations[annotation]
 3336
 3337                    # Annotation Name
 3338                    annotation_name = os.path.basename(annotation)
 3339
 3340                    if not annotation_fields:
 3341                        annotation_fields = {"INFO": None}
 3342
 3343                    log.debug(f"Annotation '{annotation_name}'")
 3344                    log.debug(
 3345                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3346                    )
 3347
 3348                    # Create Database
 3349                    database = Database(
 3350                        database=annotation,
 3351                        databases_folders=databases_folders,
 3352                        assembly=assembly,
 3353                    )
 3354
 3355                    # Find files
 3356                    db_file = database.get_database()
 3357                    db_file = full_path(db_file)
 3358                    db_hdr_file = database.get_header_file()
 3359                    db_hdr_file = full_path(db_hdr_file)
 3360                    db_file_type = database.get_format()
 3361
 3362                    # If db_file is http ?
 3363                    if database.get_database().startswith("http"):
 3364
 3365                        # Datbase is HTTP URL
 3366                        db_file_is_http = True
 3367
 3368                        # DB file keep as URL
 3369                        db_file = database.get_database()
 3370                        log.warning(
 3371                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
 3372                        )
 3373
 3374                        # Retrieve automatic annotation field name
 3375                        annotation_field = clean_annotation_field(
 3376                            os.path.basename(db_file).replace(".bw", "")
 3377                        )
 3378                        log.debug(
 3379                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
 3380                        )
 3381
 3382                        # Create automatic header file
 3383                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3384                        with open(db_hdr_file, "w") as f:
 3385                            f.write("##fileformat=VCFv4.2\n")
 3386                            f.write(
 3387                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
 3388                            )
 3389                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3390
 3391                    else:
 3392
 3393                        # Datbase is NOT HTTP URL
 3394                        db_file_is_http = False
 3395
 3396                    # Check index - try to create if not exists
 3397                    if (
 3398                        db_file is None
 3399                        or db_hdr_file is None
 3400                        or (not os.path.exists(db_file) and not db_file_is_http)
 3401                        or not os.path.exists(db_hdr_file)
 3402                        or not db_file_type in ["bw"]
 3403                    ):
 3404                        # if False:
 3405                        log.error("Annotation failed: database not valid")
 3406                        log.error(f"Annotation annotation file: {db_file}")
 3407                        log.error(f"Annotation annotation file type: {db_file_type}")
 3408                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3409                        raise ValueError(
 3410                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3411                        )
 3412                    else:
 3413
 3414                        # Log
 3415                        log.debug(
 3416                            f"Annotation '{annotation}' - file: "
 3417                            + str(db_file)
 3418                            + " and "
 3419                            + str(db_hdr_file)
 3420                        )
 3421
 3422                        # Load header as VCF object
 3423                        db_hdr_vcf = Variants(input=db_hdr_file)
 3424                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3425                        log.debug(
 3426                            "Annotation database header: "
 3427                            + str(db_hdr_vcf_header_infos)
 3428                        )
 3429
 3430                        # For all fields in database
 3431                        annotation_fields_full = False
 3432                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3433                            annotation_fields = {
 3434                                key: key for key in db_hdr_vcf_header_infos
 3435                            }
 3436                            log.debug(
 3437                                "Annotation database header - All annotations added: "
 3438                                + str(annotation_fields)
 3439                            )
 3440                            annotation_fields_full = True
 3441
 3442                        # Init
 3443                        cyvcf2_header_rename_dict = {}
 3444                        cyvcf2_header_list = []
 3445                        cyvcf2_header_indexes = {}
 3446
 3447                        # process annotation fields
 3448                        for annotation_field in annotation_fields:
 3449
 3450                            # New annotation name
 3451                            annotation_field_new = annotation_fields[annotation_field]
 3452
 3453                            # Check annotation field and index in header
 3454                            if (
 3455                                annotation_field
 3456                                in db_hdr_vcf.get_header_columns_as_list()
 3457                            ):
 3458                                annotation_field_index = (
 3459                                    db_hdr_vcf.get_header_columns_as_list().index(
 3460                                        annotation_field
 3461                                    )
 3462                                    - 3
 3463                                )
 3464                                cyvcf2_header_indexes[annotation_field_new] = (
 3465                                    annotation_field_index
 3466                                )
 3467                            else:
 3468                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3469                                log.error(msg_err)
 3470                                raise ValueError(msg_err)
 3471
 3472                            # Append annotation field in cyvcf2 header list
 3473                            cyvcf2_header_rename_dict[annotation_field_new] = (
 3474                                db_hdr_vcf_header_infos[annotation_field].id
 3475                            )
 3476                            cyvcf2_header_list.append(
 3477                                {
 3478                                    "ID": annotation_field_new,
 3479                                    "Number": db_hdr_vcf_header_infos[
 3480                                        annotation_field
 3481                                    ].num,
 3482                                    "Type": db_hdr_vcf_header_infos[
 3483                                        annotation_field
 3484                                    ].type,
 3485                                    "Description": db_hdr_vcf_header_infos[
 3486                                        annotation_field
 3487                                    ].desc,
 3488                                }
 3489                            )
 3490
 3491                            # Add header on VCF
 3492                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
 3493                                annotation_field_new,
 3494                                db_hdr_vcf_header_infos[annotation_field].num,
 3495                                db_hdr_vcf_header_infos[annotation_field].type,
 3496                                db_hdr_vcf_header_infos[annotation_field].desc,
 3497                                "HOWARD BigWig annotation",
 3498                                "unknown",
 3499                                self.code_type_map[
 3500                                    db_hdr_vcf_header_infos[annotation_field].type
 3501                                ],
 3502                            )
 3503
 3504                        # Load bigwig database
 3505                        bw_db = pyBigWig.open(db_file)
 3506                        if bw_db.isBigWig():
 3507                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3508                        else:
 3509                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3510                            log.error(msg_err)
 3511                            raise ValueError(msg_err)
 3512
 3513                        annotation_bigwig_config_list.append(
 3514                            {
 3515                                "db_file": db_file,
 3516                                "bw_db": bw_db,
 3517                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3518                                "cyvcf2_header_list": cyvcf2_header_list,
 3519                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
 3520                            }
 3521                        )
 3522
 3523                # Annotate
 3524                if annotation_bigwig_config_list:
 3525
 3526                    # Annotation config
 3527                    log.debug(
 3528                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
 3529                    )
 3530
 3531                    # Export VCF file
 3532                    self.export_variant_vcf(
 3533                        vcf_file=tmp_vcf_name,
 3534                        remove_info=True,
 3535                        add_samples=False,
 3536                        index=True,
 3537                    )
 3538
 3539                    # Load input tmp file
 3540                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3541
 3542                    # Add header in input file
 3543                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3544                        for cyvcf2_header_field in annotation_bigwig_config.get(
 3545                            "cyvcf2_header_list", []
 3546                        ):
 3547                            log.info(
 3548                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
 3549                            )
 3550                            input_vcf.add_info_to_header(cyvcf2_header_field)
 3551
 3552                    # Create output VCF file
 3553                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
 3554                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3555
 3556                    # Fetch variants
 3557                    log.info(f"Annotations 'bigwig' start...")
 3558                    for variant in input_vcf:
 3559
 3560                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3561
 3562                            # DB and indexes
 3563                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3564                            cyvcf2_header_indexes = annotation_bigwig_config.get(
 3565                                "cyvcf2_header_indexes", None
 3566                            )
 3567
 3568                            # Retrieve value from chrom pos
 3569                            res = bw_db.values(
 3570                                variant.CHROM, variant.POS - 1, variant.POS
 3571                            )
 3572
 3573                            # For each annotation fields (and indexes)
 3574                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3575
 3576                                # If value is NOT nNone
 3577                                if not np.isnan(
 3578                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3579                                ):
 3580                                    variant.INFO[cyvcf2_header_index] = res[
 3581                                        cyvcf2_header_indexes[cyvcf2_header_index]
 3582                                    ]
 3583
 3584                        # Add record in output file
 3585                        output_vcf.write_record(variant)
 3586
 3587                    # Log
 3588                    log.debug(f"Annotation done.")
 3589
 3590                    # Close and write file
 3591                    log.info(f"Annotations 'bigwig' write...")
 3592                    output_vcf.close()
 3593                    log.debug(f"Write done.")
 3594
 3595                    # Update variants
 3596                    log.info(f"Annotations 'bigwig' update...")
 3597                    self.update_from_vcf(output_vcf_file)
 3598                    log.debug(f"Update done.")
 3599
 3600        return True
 3601
 3602    def annotation_snpsift(self, threads: int = None) -> None:
 3603        """
 3604        This function annotate with bcftools
 3605
 3606        :param threads: Number of threads to use
 3607        :return: the value of the variable "return_value".
 3608        """
 3609
 3610        # DEBUG
 3611        log.debug("Start annotation with bcftools databases")
 3612
 3613        # Threads
 3614        if not threads:
 3615            threads = self.get_threads()
 3616        log.debug("Threads: " + str(threads))
 3617
 3618        # Config
 3619        config = self.get_config()
 3620        log.debug("Config: " + str(config))
 3621
 3622        # Config - snpSift
 3623        snpsift_bin_command = get_bin_command(
 3624            bin="SnpSift.jar",
 3625            tool="snpsift",
 3626            bin_type="jar",
 3627            config=config,
 3628            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3629        )
 3630        if not snpsift_bin_command:
 3631            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3632            log.error(msg_err)
 3633            raise ValueError(msg_err)
 3634
 3635        # Config - bcftools
 3636        bcftools_bin_command = get_bin_command(
 3637            bin="bcftools",
 3638            tool="bcftools",
 3639            bin_type="bin",
 3640            config=config,
 3641            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3642        )
 3643        if not bcftools_bin_command:
 3644            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3645            log.error(msg_err)
 3646            raise ValueError(msg_err)
 3647
 3648        # Config - BCFTools databases folders
 3649        databases_folders = set(
 3650            self.get_config()
 3651            .get("folders", {})
 3652            .get("databases", {})
 3653            .get("annotations", ["."])
 3654            + self.get_config()
 3655            .get("folders", {})
 3656            .get("databases", {})
 3657            .get("bcftools", ["."])
 3658        )
 3659        log.debug("Databases annotations: " + str(databases_folders))
 3660
 3661        # Param
 3662        annotations = (
 3663            self.get_param()
 3664            .get("annotation", {})
 3665            .get("snpsift", {})
 3666            .get("annotations", None)
 3667        )
 3668        log.debug("Annotations: " + str(annotations))
 3669
 3670        # Assembly
 3671        assembly = self.get_param().get(
 3672            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3673        )
 3674
 3675        # Data
 3676        table_variants = self.get_table_variants()
 3677
 3678        # Check if not empty
 3679        log.debug("Check if not empty")
 3680        sql_query_chromosomes = (
 3681            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3682        )
 3683        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3684        if not sql_query_chromosomes_df["count"][0]:
 3685            log.info(f"VCF empty")
 3686            return
 3687
 3688        # VCF header
 3689        vcf_reader = self.get_header()
 3690        log.debug("Initial header: " + str(vcf_reader.infos))
 3691
 3692        # Existing annotations
 3693        for vcf_annotation in self.get_header().infos:
 3694
 3695            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3696            log.debug(
 3697                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3698            )
 3699
 3700        if annotations:
 3701
 3702            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3703
 3704                # Export VCF file
 3705                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3706
 3707                # Init
 3708                commands = {}
 3709
 3710                for annotation in annotations:
 3711                    annotation_fields = annotations[annotation]
 3712
 3713                    # Annotation Name
 3714                    annotation_name = os.path.basename(annotation)
 3715
 3716                    if not annotation_fields:
 3717                        annotation_fields = {"INFO": None}
 3718
 3719                    log.debug(f"Annotation '{annotation_name}'")
 3720                    log.debug(
 3721                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3722                    )
 3723
 3724                    # Create Database
 3725                    database = Database(
 3726                        database=annotation,
 3727                        databases_folders=databases_folders,
 3728                        assembly=assembly,
 3729                    )
 3730
 3731                    # Find files
 3732                    db_file = database.get_database()
 3733                    db_file = full_path(db_file)
 3734                    db_hdr_file = database.get_header_file()
 3735                    db_hdr_file = full_path(db_hdr_file)
 3736                    db_file_type = database.get_format()
 3737                    db_tbi_file = f"{db_file}.tbi"
 3738                    db_file_compressed = database.is_compressed()
 3739
 3740                    # Check if compressed
 3741                    if not db_file_compressed:
 3742                        log.error(
 3743                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3744                        )
 3745                        raise ValueError(
 3746                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3747                        )
 3748
 3749                    # Check if indexed
 3750                    if not os.path.exists(db_tbi_file):
 3751                        log.error(
 3752                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3753                        )
 3754                        raise ValueError(
 3755                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3756                        )
 3757
 3758                    # Check index - try to create if not exists
 3759                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3760                        log.error("Annotation failed: database not valid")
 3761                        log.error(f"Annotation annotation file: {db_file}")
 3762                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3763                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3764                        raise ValueError(
 3765                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3766                        )
 3767                    else:
 3768
 3769                        log.debug(
 3770                            f"Annotation '{annotation}' - file: "
 3771                            + str(db_file)
 3772                            + " and "
 3773                            + str(db_hdr_file)
 3774                        )
 3775
 3776                        # Load header as VCF object
 3777                        db_hdr_vcf = Variants(input=db_hdr_file)
 3778                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3779                        log.debug(
 3780                            "Annotation database header: "
 3781                            + str(db_hdr_vcf_header_infos)
 3782                        )
 3783
 3784                        # For all fields in database
 3785                        annotation_fields_full = False
 3786                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3787                            annotation_fields = {
 3788                                key: key for key in db_hdr_vcf_header_infos
 3789                            }
 3790                            log.debug(
 3791                                "Annotation database header - All annotations added: "
 3792                                + str(annotation_fields)
 3793                            )
 3794                            annotation_fields_full = True
 3795
 3796                        # # Create file for field rename
 3797                        # log.debug("Create file for field rename")
 3798                        # tmp_rename = NamedTemporaryFile(
 3799                        #     prefix=self.get_prefix(),
 3800                        #     dir=self.get_tmp_dir(),
 3801                        #     suffix=".rename",
 3802                        #     delete=False,
 3803                        # )
 3804                        # tmp_rename_name = tmp_rename.name
 3805                        # tmp_files.append(tmp_rename_name)
 3806
 3807                        # Number of fields
 3808                        nb_annotation_field = 0
 3809                        annotation_list = []
 3810                        annotation_infos_rename_list = []
 3811
 3812                        for annotation_field in annotation_fields:
 3813
 3814                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3815                            annotation_fields_new_name = annotation_fields.get(
 3816                                annotation_field, annotation_field
 3817                            )
 3818                            if not annotation_fields_new_name:
 3819                                annotation_fields_new_name = annotation_field
 3820
 3821                            # Check if field is in DB and if field is not elready in input data
 3822                            if (
 3823                                annotation_field in db_hdr_vcf.get_header().infos
 3824                                and annotation_fields_new_name
 3825                                not in self.get_header().infos
 3826                            ):
 3827
 3828                                log.info(
 3829                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3830                                )
 3831
 3832                                # BCFTools annotate param to rename fields
 3833                                if annotation_field != annotation_fields_new_name:
 3834                                    annotation_infos_rename_list.append(
 3835                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3836                                    )
 3837
 3838                                # Add INFO field to header
 3839                                db_hdr_vcf_header_infos_number = (
 3840                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3841                                )
 3842                                db_hdr_vcf_header_infos_type = (
 3843                                    db_hdr_vcf_header_infos[annotation_field].type
 3844                                    or "String"
 3845                                )
 3846                                db_hdr_vcf_header_infos_description = (
 3847                                    db_hdr_vcf_header_infos[annotation_field].desc
 3848                                    or f"{annotation_field} description"
 3849                                )
 3850                                db_hdr_vcf_header_infos_source = (
 3851                                    db_hdr_vcf_header_infos[annotation_field].source
 3852                                    or "unknown"
 3853                                )
 3854                                db_hdr_vcf_header_infos_version = (
 3855                                    db_hdr_vcf_header_infos[annotation_field].version
 3856                                    or "unknown"
 3857                                )
 3858
 3859                                vcf_reader.infos[annotation_fields_new_name] = (
 3860                                    vcf.parser._Info(
 3861                                        annotation_fields_new_name,
 3862                                        db_hdr_vcf_header_infos_number,
 3863                                        db_hdr_vcf_header_infos_type,
 3864                                        db_hdr_vcf_header_infos_description,
 3865                                        db_hdr_vcf_header_infos_source,
 3866                                        db_hdr_vcf_header_infos_version,
 3867                                        self.code_type_map[
 3868                                            db_hdr_vcf_header_infos_type
 3869                                        ],
 3870                                    )
 3871                                )
 3872
 3873                                annotation_list.append(annotation_field)
 3874
 3875                                nb_annotation_field += 1
 3876
 3877                            else:
 3878
 3879                                if (
 3880                                    annotation_field
 3881                                    not in db_hdr_vcf.get_header().infos
 3882                                ):
 3883                                    log.warning(
 3884                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3885                                    )
 3886                                if (
 3887                                    annotation_fields_new_name
 3888                                    in self.get_header().infos
 3889                                ):
 3890                                    log.warning(
 3891                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3892                                    )
 3893
 3894                        log.info(
 3895                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3896                        )
 3897
 3898                        annotation_infos = ",".join(annotation_list)
 3899
 3900                        if annotation_infos != "":
 3901
 3902                            # Annotated VCF (and error file)
 3903                            tmp_annotation_vcf_name = os.path.join(
 3904                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3905                            )
 3906                            tmp_annotation_vcf_name_err = (
 3907                                tmp_annotation_vcf_name + ".err"
 3908                            )
 3909
 3910                            # Add fields to annotate
 3911                            if not annotation_fields_full:
 3912                                annotation_infos_option = f"-info {annotation_infos}"
 3913                            else:
 3914                                annotation_infos_option = ""
 3915
 3916                            # Info fields rename
 3917                            if annotation_infos_rename_list:
 3918                                annotation_infos_rename = " -c " + ",".join(
 3919                                    annotation_infos_rename_list
 3920                                )
 3921                            else:
 3922                                annotation_infos_rename = ""
 3923
 3924                            # Annotate command
 3925                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3926
 3927                            # Add command
 3928                            commands[command_annotate] = tmp_annotation_vcf_name
 3929
 3930                if commands:
 3931
 3932                    # Export VCF file
 3933                    self.export_variant_vcf(
 3934                        vcf_file=tmp_vcf_name,
 3935                        remove_info=True,
 3936                        add_samples=False,
 3937                        index=True,
 3938                    )
 3939                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3940
 3941                    # Num command
 3942                    nb_command = 0
 3943
 3944                    # Annotate
 3945                    for command_annotate in commands:
 3946                        nb_command += 1
 3947                        log.info(
 3948                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3949                        )
 3950                        log.debug(f"command_annotate={command_annotate}")
 3951                        run_parallel_commands([command_annotate], threads)
 3952
 3953                        # Debug
 3954                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3955
 3956                        # Update variants
 3957                        log.info(
 3958                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3959                        )
 3960                        self.update_from_vcf(commands[command_annotate])
 3961
 3962    def annotation_bcftools(self, threads: int = None) -> None:
 3963        """
 3964        This function annotate with bcftools
 3965
 3966        :param threads: Number of threads to use
 3967        :return: the value of the variable "return_value".
 3968        """
 3969
 3970        # DEBUG
 3971        log.debug("Start annotation with bcftools databases")
 3972
 3973        # Threads
 3974        if not threads:
 3975            threads = self.get_threads()
 3976        log.debug("Threads: " + str(threads))
 3977
 3978        # Config
 3979        config = self.get_config()
 3980        log.debug("Config: " + str(config))
 3981
 3982        # DEBUG
 3983        delete_tmp = True
 3984        if self.get_config().get("verbosity", "warning") in ["debug"]:
 3985            delete_tmp = False
 3986            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 3987
 3988        # Config - BCFTools bin command
 3989        bcftools_bin_command = get_bin_command(
 3990            bin="bcftools",
 3991            tool="bcftools",
 3992            bin_type="bin",
 3993            config=config,
 3994            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3995        )
 3996        if not bcftools_bin_command:
 3997            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3998            log.error(msg_err)
 3999            raise ValueError(msg_err)
 4000
 4001        # Config - BCFTools databases folders
 4002        databases_folders = set(
 4003            self.get_config()
 4004            .get("folders", {})
 4005            .get("databases", {})
 4006            .get("annotations", ["."])
 4007            + self.get_config()
 4008            .get("folders", {})
 4009            .get("databases", {})
 4010            .get("bcftools", ["."])
 4011        )
 4012        log.debug("Databases annotations: " + str(databases_folders))
 4013
 4014        # Param
 4015        annotations = (
 4016            self.get_param()
 4017            .get("annotation", {})
 4018            .get("bcftools", {})
 4019            .get("annotations", None)
 4020        )
 4021        log.debug("Annotations: " + str(annotations))
 4022
 4023        # Assembly
 4024        assembly = self.get_param().get(
 4025            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 4026        )
 4027
 4028        # Data
 4029        table_variants = self.get_table_variants()
 4030
 4031        # Check if not empty
 4032        log.debug("Check if not empty")
 4033        sql_query_chromosomes = (
 4034            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4035        )
 4036        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 4037        if not sql_query_chromosomes_df["count"][0]:
 4038            log.info(f"VCF empty")
 4039            return
 4040
 4041        # Export in VCF
 4042        log.debug("Create initial file to annotate")
 4043        tmp_vcf = NamedTemporaryFile(
 4044            prefix=self.get_prefix(),
 4045            dir=self.get_tmp_dir(),
 4046            suffix=".vcf.gz",
 4047            delete=False,
 4048        )
 4049        tmp_vcf_name = tmp_vcf.name
 4050
 4051        # VCF header
 4052        vcf_reader = self.get_header()
 4053        log.debug("Initial header: " + str(vcf_reader.infos))
 4054
 4055        # Existing annotations
 4056        for vcf_annotation in self.get_header().infos:
 4057
 4058            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4059            log.debug(
 4060                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4061            )
 4062
 4063        if annotations:
 4064
 4065            tmp_ann_vcf_list = []
 4066            commands = []
 4067            tmp_files = []
 4068            err_files = []
 4069
 4070            for annotation in annotations:
 4071                annotation_fields = annotations[annotation]
 4072
 4073                # Annotation Name
 4074                annotation_name = os.path.basename(annotation)
 4075
 4076                if not annotation_fields:
 4077                    annotation_fields = {"INFO": None}
 4078
 4079                log.debug(f"Annotation '{annotation_name}'")
 4080                log.debug(
 4081                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 4082                )
 4083
 4084                # Create Database
 4085                database = Database(
 4086                    database=annotation,
 4087                    databases_folders=databases_folders,
 4088                    assembly=assembly,
 4089                )
 4090
 4091                # Find files
 4092                db_file = database.get_database()
 4093                db_file = full_path(db_file)
 4094                db_hdr_file = database.get_header_file()
 4095                db_hdr_file = full_path(db_hdr_file)
 4096                db_file_type = database.get_format()
 4097                db_tbi_file = f"{db_file}.tbi"
 4098                db_file_compressed = database.is_compressed()
 4099
 4100                # Check if compressed
 4101                if not db_file_compressed:
 4102                    log.error(
 4103                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4104                    )
 4105                    raise ValueError(
 4106                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4107                    )
 4108
 4109                # Check if indexed
 4110                if not os.path.exists(db_tbi_file):
 4111                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 4112                    raise ValueError(
 4113                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 4114                    )
 4115
 4116                # Check index - try to create if not exists
 4117                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 4118                    log.error("Annotation failed: database not valid")
 4119                    log.error(f"Annotation annotation file: {db_file}")
 4120                    log.error(f"Annotation annotation header: {db_hdr_file}")
 4121                    log.error(f"Annotation annotation index: {db_tbi_file}")
 4122                    raise ValueError(
 4123                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 4124                    )
 4125                else:
 4126
 4127                    log.debug(
 4128                        f"Annotation '{annotation}' - file: "
 4129                        + str(db_file)
 4130                        + " and "
 4131                        + str(db_hdr_file)
 4132                    )
 4133
 4134                    # Load header as VCF object
 4135                    db_hdr_vcf = Variants(input=db_hdr_file)
 4136                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 4137                    log.debug(
 4138                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 4139                    )
 4140
 4141                    # For all fields in database
 4142                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 4143                        annotation_fields = {
 4144                            key: key for key in db_hdr_vcf_header_infos
 4145                        }
 4146                        log.debug(
 4147                            "Annotation database header - All annotations added: "
 4148                            + str(annotation_fields)
 4149                        )
 4150
 4151                    # Number of fields
 4152                    nb_annotation_field = 0
 4153                    annotation_list = []
 4154
 4155                    for annotation_field in annotation_fields:
 4156
 4157                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 4158                        annotation_fields_new_name = annotation_fields.get(
 4159                            annotation_field, annotation_field
 4160                        )
 4161                        if not annotation_fields_new_name:
 4162                            annotation_fields_new_name = annotation_field
 4163
 4164                        # Check if field is in DB and if field is not elready in input data
 4165                        if (
 4166                            annotation_field in db_hdr_vcf.get_header().infos
 4167                            and annotation_fields_new_name
 4168                            not in self.get_header().infos
 4169                        ):
 4170
 4171                            log.info(
 4172                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 4173                            )
 4174
 4175                            # Add INFO field to header
 4176                            db_hdr_vcf_header_infos_number = (
 4177                                db_hdr_vcf_header_infos[annotation_field].num or "."
 4178                            )
 4179                            db_hdr_vcf_header_infos_type = (
 4180                                db_hdr_vcf_header_infos[annotation_field].type
 4181                                or "String"
 4182                            )
 4183                            db_hdr_vcf_header_infos_description = (
 4184                                db_hdr_vcf_header_infos[annotation_field].desc
 4185                                or f"{annotation_field} description"
 4186                            )
 4187                            db_hdr_vcf_header_infos_source = (
 4188                                db_hdr_vcf_header_infos[annotation_field].source
 4189                                or "unknown"
 4190                            )
 4191                            db_hdr_vcf_header_infos_version = (
 4192                                db_hdr_vcf_header_infos[annotation_field].version
 4193                                or "unknown"
 4194                            )
 4195
 4196                            vcf_reader.infos[annotation_fields_new_name] = (
 4197                                vcf.parser._Info(
 4198                                    annotation_fields_new_name,
 4199                                    db_hdr_vcf_header_infos_number,
 4200                                    db_hdr_vcf_header_infos_type,
 4201                                    db_hdr_vcf_header_infos_description,
 4202                                    db_hdr_vcf_header_infos_source,
 4203                                    db_hdr_vcf_header_infos_version,
 4204                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 4205                                )
 4206                            )
 4207
 4208                            # annotation_list.append(annotation_field)
 4209                            if annotation_field != annotation_fields_new_name:
 4210                                annotation_list.append(
 4211                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 4212                                )
 4213                            else:
 4214                                annotation_list.append(annotation_field)
 4215
 4216                            nb_annotation_field += 1
 4217
 4218                        else:
 4219
 4220                            if annotation_field not in db_hdr_vcf.get_header().infos:
 4221                                log.warning(
 4222                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 4223                                )
 4224                            if annotation_fields_new_name in self.get_header().infos:
 4225                                log.warning(
 4226                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 4227                                )
 4228
 4229                    log.info(
 4230                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 4231                    )
 4232
 4233                    annotation_infos = ",".join(annotation_list)
 4234
 4235                    if annotation_infos != "":
 4236
 4237                        # Protect header for bcftools (remove "#CHROM" and variants line)
 4238                        log.debug("Protect Header file - remove #CHROM line if exists")
 4239                        tmp_header_vcf = NamedTemporaryFile(
 4240                            prefix=self.get_prefix(),
 4241                            dir=self.get_tmp_dir(),
 4242                            suffix=".hdr",
 4243                            delete=False,
 4244                        )
 4245                        tmp_header_vcf_name = tmp_header_vcf.name
 4246                        tmp_files.append(tmp_header_vcf_name)
 4247                        # Command
 4248                        if db_hdr_file.endswith(".gz"):
 4249                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4250                        else:
 4251                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4252                        # Run
 4253                        run_parallel_commands([command_extract_header], 1)
 4254
 4255                        # Find chomosomes
 4256                        log.debug("Find chromosomes ")
 4257                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 4258                        sql_query_chromosomes_df = self.get_query_to_df(
 4259                            sql_query_chromosomes
 4260                        )
 4261                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 4262
 4263                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 4264
 4265                        # BED columns in the annotation file
 4266                        if db_file_type in ["bed"]:
 4267                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 4268
 4269                        for chrom in chomosomes_list:
 4270
 4271                            # Create BED on initial VCF
 4272                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 4273                            tmp_bed = NamedTemporaryFile(
 4274                                prefix=self.get_prefix(),
 4275                                dir=self.get_tmp_dir(),
 4276                                suffix=".bed",
 4277                                delete=False,
 4278                            )
 4279                            tmp_bed_name = tmp_bed.name
 4280                            tmp_files.append(tmp_bed_name)
 4281
 4282                            # Detecte regions
 4283                            log.debug(
 4284                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 4285                            )
 4286                            window = 1000000
 4287                            sql_query_intervals_for_bed = f"""
 4288                                SELECT  \"#CHROM\",
 4289                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 4290                                        \"POS\"+{window}
 4291                                FROM {table_variants} as table_variants
 4292                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 4293                            """
 4294                            regions = self.conn.execute(
 4295                                sql_query_intervals_for_bed
 4296                            ).fetchall()
 4297                            merged_regions = merge_regions(regions)
 4298                            log.debug(
 4299                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 4300                            )
 4301
 4302                            header = ["#CHROM", "START", "END"]
 4303                            with open(tmp_bed_name, "w") as f:
 4304                                # Write the header with tab delimiter
 4305                                f.write("\t".join(header) + "\n")
 4306                                for d in merged_regions:
 4307                                    # Write each data row with tab delimiter
 4308                                    f.write("\t".join(map(str, d)) + "\n")
 4309
 4310                            # Tmp files
 4311                            tmp_annotation_vcf = NamedTemporaryFile(
 4312                                prefix=self.get_prefix(),
 4313                                dir=self.get_tmp_dir(),
 4314                                suffix=".vcf.gz",
 4315                                delete=False,
 4316                            )
 4317                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 4318                            tmp_files.append(tmp_annotation_vcf_name)
 4319                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 4320                            tmp_annotation_vcf_name_err = (
 4321                                tmp_annotation_vcf_name + ".err"
 4322                            )
 4323                            err_files.append(tmp_annotation_vcf_name_err)
 4324
 4325                            # Annotate Command
 4326                            log.debug(
 4327                                f"Annotation '{annotation}' - add bcftools command"
 4328                            )
 4329
 4330                            # Command
 4331                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 4332
 4333                            # Add command
 4334                            commands.append(command_annotate)
 4335
 4336            # if some commands
 4337            if commands:
 4338
 4339                # Export VCF file
 4340                self.export_variant_vcf(
 4341                    vcf_file=tmp_vcf_name,
 4342                    remove_info=True,
 4343                    add_samples=False,
 4344                    index=True,
 4345                )
 4346
 4347                # Threads
 4348                # calculate threads for annotated commands
 4349                if commands:
 4350                    threads_bcftools_annotate = round(threads / len(commands))
 4351                else:
 4352                    threads_bcftools_annotate = 1
 4353
 4354                if not threads_bcftools_annotate:
 4355                    threads_bcftools_annotate = 1
 4356
 4357                # Add threads option to bcftools commands
 4358                if threads_bcftools_annotate > 1:
 4359                    commands_threaded = []
 4360                    for command in commands:
 4361                        commands_threaded.append(
 4362                            command.replace(
 4363                                f"{bcftools_bin_command} annotate ",
 4364                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 4365                            )
 4366                        )
 4367                    commands = commands_threaded
 4368
 4369                # Command annotation multithreading
 4370                log.debug(f"Annotation - Annotation commands: " + str(commands))
 4371                log.info(
 4372                    f"Annotation - Annotation multithreaded in "
 4373                    + str(len(commands))
 4374                    + " commands"
 4375                )
 4376
 4377                run_parallel_commands(commands, threads)
 4378
 4379                # Merge
 4380                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4381
 4382                if tmp_ann_vcf_list_cmd:
 4383
 4384                    # Tmp file
 4385                    tmp_annotate_vcf = NamedTemporaryFile(
 4386                        prefix=self.get_prefix(),
 4387                        dir=self.get_tmp_dir(),
 4388                        suffix=".vcf.gz",
 4389                        delete=True,
 4390                    )
 4391                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4392                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4393                    err_files.append(tmp_annotate_vcf_name_err)
 4394
 4395                    # Tmp file remove command
 4396                    tmp_files_remove_command = ""
 4397                    if tmp_files:
 4398                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4399
 4400                    # Command merge
 4401                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4402                    log.info(
 4403                        f"Annotation - Annotation merging "
 4404                        + str(len(commands))
 4405                        + " annotated files"
 4406                    )
 4407                    log.debug(f"Annotation - merge command: {merge_command}")
 4408                    run_parallel_commands([merge_command], 1)
 4409
 4410                    # Error messages
 4411                    log.info(f"Error/Warning messages:")
 4412                    error_message_command_all = []
 4413                    error_message_command_warning = []
 4414                    error_message_command_err = []
 4415                    for err_file in err_files:
 4416                        with open(err_file, "r") as f:
 4417                            for line in f:
 4418                                message = line.strip()
 4419                                error_message_command_all.append(message)
 4420                                if line.startswith("[W::"):
 4421                                    error_message_command_warning.append(message)
 4422                                if line.startswith("[E::"):
 4423                                    error_message_command_err.append(
 4424                                        f"{err_file}: " + message
 4425                                    )
 4426                    # log info
 4427                    for message in list(
 4428                        set(error_message_command_err + error_message_command_warning)
 4429                    ):
 4430                        log.info(f"   {message}")
 4431                    # debug info
 4432                    for message in list(set(error_message_command_all)):
 4433                        log.debug(f"   {message}")
 4434                    # failed
 4435                    if len(error_message_command_err):
 4436                        log.error("Annotation failed: Error in commands")
 4437                        raise ValueError("Annotation failed: Error in commands")
 4438
 4439                    # Update variants
 4440                    log.info(f"Annotation - Updating...")
 4441                    self.update_from_vcf(tmp_annotate_vcf_name)
 4442
 4443    def annotation_exomiser(self, threads: int = None) -> None:
 4444        """
 4445        This function annotate with Exomiser
 4446
 4447        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4448        - "analysis" (dict/file):
 4449            Full analysis dictionnary parameters (see Exomiser docs).
 4450            Either a dict, or a file in JSON or YAML format.
 4451            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
 4452            Default : None
 4453        - "preset" (string):
 4454            Analysis preset (available in config folder).
 4455            Used if no full "analysis" is provided.
 4456            Default: "exome"
 4457        - "phenopacket" (dict/file):
 4458            Samples and phenotipic features parameters (see Exomiser docs).
 4459            Either a dict, or a file in JSON or YAML format.
 4460            Default: None
 4461        - "subject" (dict):
 4462            Sample parameters (see Exomiser docs).
 4463            Example:
 4464                "subject":
 4465                    {
 4466                        "id": "ISDBM322017",
 4467                        "sex": "FEMALE"
 4468                    }
 4469            Default: None
 4470        - "sample" (string):
 4471            Sample name to construct "subject" section:
 4472                "subject":
 4473                    {
 4474                        "id": "<sample>",
 4475                        "sex": "UNKNOWN_SEX"
 4476                    }
 4477            Default: None
 4478        - "phenotypicFeatures" (dict)
 4479            Phenotypic features to construct "subject" section.
 4480            Example:
 4481                "phenotypicFeatures":
 4482                    [
 4483                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4484                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4485                    ]
 4486        - "hpo" (list)
 4487            List of HPO ids as phenotypic features.
 4488            Example:
 4489                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4490            Default: []
 4491        - "outputOptions" (dict):
 4492            Output options (see Exomiser docs).
 4493            Default:
 4494                "output_options" =
 4495                    {
 4496                        "outputContributingVariantsOnly": False,
 4497                        "numGenes": 0,
 4498                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4499                    }
 4500        - "transcript_source" (string):
 4501            Transcript source (either "refseq", "ucsc", "ensembl")
 4502            Default: "refseq"
 4503        - "exomiser_to_info" (boolean):
 4504            Add exomiser TSV file columns as INFO fields in VCF.
 4505            Default: False
 4506        - "release" (string):
 4507            Exomise database release.
 4508            If not exists, database release will be downloaded (take a while).
 4509            Default: None (provided by application.properties configuration file)
 4510        - "exomiser_application_properties" (file):
 4511            Exomiser configuration file (see Exomiser docs).
 4512            Useful to automatically download databases (especially for specific genome databases).
 4513
 4514        Notes:
 4515        - If no sample in parameters, first sample in VCF will be chosen
 4516        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4517
 4518        :param threads: The number of threads to use
 4519        :return: None.
 4520        """
 4521
 4522        # DEBUG
 4523        log.debug("Start annotation with Exomiser databases")
 4524
 4525        # Threads
 4526        if not threads:
 4527            threads = self.get_threads()
 4528        log.debug("Threads: " + str(threads))
 4529
 4530        # Config
 4531        config = self.get_config()
 4532        log.debug("Config: " + str(config))
 4533
 4534        # Config - Folders - Databases
 4535        databases_folders = (
 4536            config.get("folders", {})
 4537            .get("databases", {})
 4538            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4539        )
 4540        databases_folders = full_path(databases_folders)
 4541        if not os.path.exists(databases_folders):
 4542            log.error(f"Databases annotations: {databases_folders} NOT found")
 4543        log.debug("Databases annotations: " + str(databases_folders))
 4544
 4545        # Config - Exomiser
 4546        exomiser_bin_command = get_bin_command(
 4547            bin="exomiser-cli*.jar",
 4548            tool="exomiser",
 4549            bin_type="jar",
 4550            config=config,
 4551            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4552        )
 4553        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4554        if not exomiser_bin_command:
 4555            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4556            log.error(msg_err)
 4557            raise ValueError(msg_err)
 4558
 4559        # Param
 4560        param = self.get_param()
 4561        log.debug("Param: " + str(param))
 4562
 4563        # Param - Exomiser
 4564        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4565        log.debug(f"Param Exomiser: {param_exomiser}")
 4566
 4567        # Param - Assembly
 4568        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4569        log.debug("Assembly: " + str(assembly))
 4570
 4571        # Data
 4572        table_variants = self.get_table_variants()
 4573
 4574        # Check if not empty
 4575        log.debug("Check if not empty")
 4576        sql_query_chromosomes = (
 4577            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4578        )
 4579        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4580            log.info(f"VCF empty")
 4581            return False
 4582
 4583        # VCF header
 4584        vcf_reader = self.get_header()
 4585        log.debug("Initial header: " + str(vcf_reader.infos))
 4586
 4587        # Samples
 4588        samples = self.get_header_sample_list()
 4589        if not samples:
 4590            log.error("No Samples in VCF")
 4591            return False
 4592        log.debug(f"Samples: {samples}")
 4593
 4594        # Memory limit
 4595        memory_limit = self.get_memory("8G")
 4596        log.debug(f"memory_limit: {memory_limit}")
 4597
 4598        # Exomiser java options
 4599        exomiser_java_options = (
 4600            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4601        )
 4602        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4603
 4604        # Download Exomiser (if not exists)
 4605        exomiser_release = param_exomiser.get("release", None)
 4606        exomiser_application_properties = param_exomiser.get(
 4607            "exomiser_application_properties", None
 4608        )
 4609        databases_download_exomiser(
 4610            assemblies=[assembly],
 4611            exomiser_folder=databases_folders,
 4612            exomiser_release=exomiser_release,
 4613            exomiser_phenotype_release=exomiser_release,
 4614            exomiser_application_properties=exomiser_application_properties,
 4615        )
 4616
 4617        # Force annotation
 4618        force_update_annotation = True
 4619
 4620        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4621            log.debug("Start annotation Exomiser")
 4622
 4623            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4624
 4625                # tmp_dir = "/tmp/exomiser"
 4626
 4627                ### ANALYSIS ###
 4628                ################
 4629
 4630                # Create analysis.json through analysis dict
 4631                # either analysis in param or by default
 4632                # depending on preset exome/genome)
 4633
 4634                # Init analysis dict
 4635                param_exomiser_analysis_dict = {}
 4636
 4637                # analysis from param
 4638                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4639                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4640
 4641                # If analysis in param -> load anlaysis json
 4642                if param_exomiser_analysis:
 4643
 4644                    # If param analysis is a file and exists
 4645                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4646                        param_exomiser_analysis
 4647                    ):
 4648                        # Load analysis file into analysis dict (either yaml or json)
 4649                        with open(param_exomiser_analysis) as json_file:
 4650                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4651
 4652                    # If param analysis is a dict
 4653                    elif isinstance(param_exomiser_analysis, dict):
 4654                        # Load analysis dict into analysis dict (either yaml or json)
 4655                        param_exomiser_analysis_dict = param_exomiser_analysis
 4656
 4657                    # Error analysis type
 4658                    else:
 4659                        log.error(f"Analysis type unknown. Check param file.")
 4660                        raise ValueError(f"Analysis type unknown. Check param file.")
 4661
 4662                # Case no input analysis config file/dict
 4663                # Use preset (exome/genome) to open default config file
 4664                if not param_exomiser_analysis_dict:
 4665
 4666                    # default preset
 4667                    default_preset = "exome"
 4668
 4669                    # Get param preset or default preset
 4670                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4671
 4672                    # Try to find if preset is a file
 4673                    if os.path.exists(param_exomiser_preset):
 4674                        # Preset file is provided in full path
 4675                        param_exomiser_analysis_default_config_file = (
 4676                            param_exomiser_preset
 4677                        )
 4678                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4679                    #     # Preset file is provided in full path
 4680                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4681                    elif os.path.exists(
 4682                        os.path.join(folder_config, param_exomiser_preset)
 4683                    ):
 4684                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4685                        param_exomiser_analysis_default_config_file = os.path.join(
 4686                            folder_config, param_exomiser_preset
 4687                        )
 4688                    else:
 4689                        # Construct preset file
 4690                        param_exomiser_analysis_default_config_file = os.path.join(
 4691                            folder_config,
 4692                            f"preset-{param_exomiser_preset}-analysis.json",
 4693                        )
 4694
 4695                    # If preset file exists
 4696                    param_exomiser_analysis_default_config_file = full_path(
 4697                        param_exomiser_analysis_default_config_file
 4698                    )
 4699                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4700                        # Load prest file into analysis dict (either yaml or json)
 4701                        with open(
 4702                            param_exomiser_analysis_default_config_file
 4703                        ) as json_file:
 4704                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4705                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4706                                json_file
 4707                            )
 4708
 4709                    # Error preset file
 4710                    else:
 4711                        log.error(
 4712                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4713                        )
 4714                        raise ValueError(
 4715                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4716                        )
 4717
 4718                # If no analysis dict created
 4719                if not param_exomiser_analysis_dict:
 4720                    log.error(f"No analysis config")
 4721                    raise ValueError(f"No analysis config")
 4722
 4723                # Log
 4724                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4725
 4726                ### PHENOPACKET ###
 4727                ###################
 4728
 4729                # If no PhenoPacket in analysis dict -> check in param
 4730                if "phenopacket" not in param_exomiser_analysis_dict:
 4731
 4732                    # If PhenoPacket in param -> load anlaysis json
 4733                    if param_exomiser.get("phenopacket", None):
 4734
 4735                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4736                        param_exomiser_phenopacket = full_path(
 4737                            param_exomiser_phenopacket
 4738                        )
 4739
 4740                        # If param phenopacket is a file and exists
 4741                        if isinstance(
 4742                            param_exomiser_phenopacket, str
 4743                        ) and os.path.exists(param_exomiser_phenopacket):
 4744                            # Load phenopacket file into analysis dict (either yaml or json)
 4745                            with open(param_exomiser_phenopacket) as json_file:
 4746                                param_exomiser_analysis_dict["phenopacket"] = (
 4747                                    yaml.safe_load(json_file)
 4748                                )
 4749
 4750                        # If param phenopacket is a dict
 4751                        elif isinstance(param_exomiser_phenopacket, dict):
 4752                            # Load phenopacket dict into analysis dict (either yaml or json)
 4753                            param_exomiser_analysis_dict["phenopacket"] = (
 4754                                param_exomiser_phenopacket
 4755                            )
 4756
 4757                        # Error phenopacket type
 4758                        else:
 4759                            log.error(f"Phenopacket type unknown. Check param file.")
 4760                            raise ValueError(
 4761                                f"Phenopacket type unknown. Check param file."
 4762                            )
 4763
 4764                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4765                if "phenopacket" not in param_exomiser_analysis_dict:
 4766
 4767                    # Init PhenoPacket
 4768                    param_exomiser_analysis_dict["phenopacket"] = {
 4769                        "id": "analysis",
 4770                        "proband": {},
 4771                    }
 4772
 4773                    ### Add subject ###
 4774
 4775                    # If subject exists
 4776                    param_exomiser_subject = param_exomiser.get("subject", {})
 4777
 4778                    # If subject not exists -> found sample ID
 4779                    if not param_exomiser_subject:
 4780
 4781                        # Found sample ID in param
 4782                        sample = param_exomiser.get("sample", None)
 4783
 4784                        # Find sample ID (first sample)
 4785                        if not sample:
 4786                            sample_list = self.get_header_sample_list()
 4787                            if len(sample_list) > 0:
 4788                                sample = sample_list[0]
 4789                            else:
 4790                                log.error(f"No sample found")
 4791                                raise ValueError(f"No sample found")
 4792
 4793                        # Create subject
 4794                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4795
 4796                    # Add to dict
 4797                    param_exomiser_analysis_dict["phenopacket"][
 4798                        "subject"
 4799                    ] = param_exomiser_subject
 4800
 4801                    ### Add "phenotypicFeatures" ###
 4802
 4803                    # If phenotypicFeatures exists
 4804                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4805                        "phenotypicFeatures", []
 4806                    )
 4807
 4808                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4809                    if not param_exomiser_phenotypicfeatures:
 4810
 4811                        # Found HPO in param
 4812                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4813
 4814                        # Split HPO if list in string format separated by comma
 4815                        if isinstance(param_exomiser_hpo, str):
 4816                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4817
 4818                        # Create HPO list
 4819                        for hpo in param_exomiser_hpo:
 4820                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4821                            param_exomiser_phenotypicfeatures.append(
 4822                                {
 4823                                    "type": {
 4824                                        "id": f"HP:{hpo_clean}",
 4825                                        "label": f"HP:{hpo_clean}",
 4826                                    }
 4827                                }
 4828                            )
 4829
 4830                    # Add to dict
 4831                    param_exomiser_analysis_dict["phenopacket"][
 4832                        "phenotypicFeatures"
 4833                    ] = param_exomiser_phenotypicfeatures
 4834
 4835                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4836                    if not param_exomiser_phenotypicfeatures:
 4837                        for step in param_exomiser_analysis_dict.get(
 4838                            "analysis", {}
 4839                        ).get("steps", []):
 4840                            if "hiPhivePrioritiser" in step:
 4841                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4842                                    "steps", []
 4843                                ).remove(step)
 4844
 4845                ### Add Input File ###
 4846
 4847                # Initial file name and htsFiles
 4848                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4849                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4850                    {
 4851                        "uri": tmp_vcf_name,
 4852                        "htsFormat": "VCF",
 4853                        "genomeAssembly": assembly,
 4854                    }
 4855                ]
 4856
 4857                ### Add metaData ###
 4858
 4859                # If metaData not in analysis dict
 4860                if "metaData" not in param_exomiser_analysis_dict:
 4861                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4862                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4863                        "createdBy": "howard",
 4864                        "phenopacketSchemaVersion": 1,
 4865                    }
 4866
 4867                ### OutputOptions ###
 4868
 4869                # Init output result folder
 4870                output_results = os.path.join(tmp_dir, "results")
 4871
 4872                # If no outputOptions in analysis dict
 4873                if "outputOptions" not in param_exomiser_analysis_dict:
 4874
 4875                    # default output formats
 4876                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4877
 4878                    # Get outputOptions in param
 4879                    output_options = param_exomiser.get("outputOptions", None)
 4880
 4881                    # If no output_options in param -> check
 4882                    if not output_options:
 4883                        output_options = {
 4884                            "outputContributingVariantsOnly": False,
 4885                            "numGenes": 0,
 4886                            "outputFormats": defaut_output_formats,
 4887                        }
 4888
 4889                    # Replace outputDirectory in output options
 4890                    output_options["outputDirectory"] = output_results
 4891                    output_options["outputFileName"] = "howard"
 4892
 4893                    # Add outputOptions in analysis dict
 4894                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4895
 4896                else:
 4897
 4898                    # Replace output_results and output format (if exists in param)
 4899                    param_exomiser_analysis_dict["outputOptions"][
 4900                        "outputDirectory"
 4901                    ] = output_results
 4902                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4903                        list(
 4904                            set(
 4905                                param_exomiser_analysis_dict.get(
 4906                                    "outputOptions", {}
 4907                                ).get("outputFormats", [])
 4908                                + ["TSV_VARIANT", "VCF"]
 4909                            )
 4910                        )
 4911                    )
 4912
 4913                # log
 4914                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4915
 4916                ### ANALYSIS FILE ###
 4917                #####################
 4918
 4919                ### Full JSON analysis config file ###
 4920
 4921                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4922                with open(exomiser_analysis, "w") as fp:
 4923                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4924
 4925                ### SPLIT analysis and sample config files
 4926
 4927                # Splitted analysis dict
 4928                param_exomiser_analysis_dict_for_split = (
 4929                    param_exomiser_analysis_dict.copy()
 4930                )
 4931
 4932                # Phenopacket JSON file
 4933                exomiser_analysis_phenopacket = os.path.join(
 4934                    tmp_dir, "analysis_phenopacket.json"
 4935                )
 4936                with open(exomiser_analysis_phenopacket, "w") as fp:
 4937                    json.dump(
 4938                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4939                        fp,
 4940                        indent=4,
 4941                    )
 4942
 4943                # Analysis JSON file without Phenopacket parameters
 4944                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4945                exomiser_analysis_analysis = os.path.join(
 4946                    tmp_dir, "analysis_analysis.json"
 4947                )
 4948                with open(exomiser_analysis_analysis, "w") as fp:
 4949                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4950
 4951                ### INITAL VCF file ###
 4952                #######################
 4953
 4954                ### Create list of samples to use and include inti initial VCF file ####
 4955
 4956                # Subject (main sample)
 4957                # Get sample ID in analysis dict
 4958                sample_subject = (
 4959                    param_exomiser_analysis_dict.get("phenopacket", {})
 4960                    .get("subject", {})
 4961                    .get("id", None)
 4962                )
 4963                sample_proband = (
 4964                    param_exomiser_analysis_dict.get("phenopacket", {})
 4965                    .get("proband", {})
 4966                    .get("subject", {})
 4967                    .get("id", None)
 4968                )
 4969                sample = []
 4970                if sample_subject:
 4971                    sample.append(sample_subject)
 4972                if sample_proband:
 4973                    sample.append(sample_proband)
 4974
 4975                # Get sample ID within Pedigree
 4976                pedigree_persons_list = (
 4977                    param_exomiser_analysis_dict.get("phenopacket", {})
 4978                    .get("pedigree", {})
 4979                    .get("persons", {})
 4980                )
 4981
 4982                # Create list with all sample ID in pedigree (if exists)
 4983                pedigree_persons = []
 4984                for person in pedigree_persons_list:
 4985                    pedigree_persons.append(person.get("individualId"))
 4986
 4987                # Concat subject sample ID and samples ID in pedigreesamples
 4988                samples = list(set(sample + pedigree_persons))
 4989
 4990                # Check if sample list is not empty
 4991                if not samples:
 4992                    log.error(f"No samples found")
 4993                    raise ValueError(f"No samples found")
 4994
 4995                # Create VCF with sample (either sample in param or first one by default)
 4996                # Export VCF file
 4997                self.export_variant_vcf(
 4998                    vcf_file=tmp_vcf_name,
 4999                    remove_info=True,
 5000                    add_samples=True,
 5001                    list_samples=samples,
 5002                    index=False,
 5003                )
 5004
 5005                ### Execute Exomiser ###
 5006                ########################
 5007
 5008                # Init command
 5009                exomiser_command = ""
 5010
 5011                # Command exomiser options
 5012                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 5013
 5014                # Release
 5015                exomiser_release = param_exomiser.get("release", None)
 5016                if exomiser_release:
 5017                    # phenotype data version
 5018                    exomiser_options += (
 5019                        f" --exomiser.phenotype.data-version={exomiser_release} "
 5020                    )
 5021                    # data version
 5022                    exomiser_options += (
 5023                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 5024                    )
 5025                    # variant white list
 5026                    variant_white_list_file = (
 5027                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 5028                    )
 5029                    if os.path.exists(
 5030                        os.path.join(
 5031                            databases_folders, assembly, variant_white_list_file
 5032                        )
 5033                    ):
 5034                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 5035
 5036                # transcript_source
 5037                transcript_source = param_exomiser.get(
 5038                    "transcript_source", None
 5039                )  # ucsc, refseq, ensembl
 5040                if transcript_source:
 5041                    exomiser_options += (
 5042                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 5043                    )
 5044
 5045                # If analysis contain proband param
 5046                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 5047                    "proband", {}
 5048                ):
 5049                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 5050
 5051                # If no proband (usually uniq sample)
 5052                else:
 5053                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5054
 5055                # Log
 5056                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5057
 5058                # Run command
 5059                result = subprocess.call(
 5060                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5061                )
 5062                if result:
 5063                    log.error("Exomiser command failed")
 5064                    raise ValueError("Exomiser command failed")
 5065
 5066                ### RESULTS ###
 5067                ###############
 5068
 5069                ### Annotate with TSV fields ###
 5070
 5071                # Init result tsv file
 5072                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5073
 5074                # Init result tsv file
 5075                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5076
 5077                # Parse TSV file and explode columns in INFO field
 5078                if exomiser_to_info and os.path.exists(output_results_tsv):
 5079
 5080                    # Log
 5081                    log.debug("Exomiser columns to VCF INFO field")
 5082
 5083                    # Retrieve columns and types
 5084                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5085                    output_results_tsv_df = self.get_query_to_df(query)
 5086                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5087
 5088                    # Init concat fields for update
 5089                    sql_query_update_concat_fields = []
 5090
 5091                    # Fields to avoid
 5092                    fields_to_avoid = [
 5093                        "CONTIG",
 5094                        "START",
 5095                        "END",
 5096                        "REF",
 5097                        "ALT",
 5098                        "QUAL",
 5099                        "FILTER",
 5100                        "GENOTYPE",
 5101                    ]
 5102
 5103                    # List all columns to add into header
 5104                    for header_column in output_results_tsv_columns:
 5105
 5106                        # If header column is enable
 5107                        if header_column not in fields_to_avoid:
 5108
 5109                            # Header info type
 5110                            header_info_type = "String"
 5111                            header_column_df = output_results_tsv_df[header_column]
 5112                            header_column_df_dtype = header_column_df.dtype
 5113                            if header_column_df_dtype == object:
 5114                                if (
 5115                                    pd.to_numeric(header_column_df, errors="coerce")
 5116                                    .notnull()
 5117                                    .all()
 5118                                ):
 5119                                    header_info_type = "Float"
 5120                            else:
 5121                                header_info_type = "Integer"
 5122
 5123                            # Header info
 5124                            characters_to_validate = ["-"]
 5125                            pattern = "[" + "".join(characters_to_validate) + "]"
 5126                            header_info_name = re.sub(
 5127                                pattern,
 5128                                "_",
 5129                                f"Exomiser_{header_column}".replace("#", ""),
 5130                            )
 5131                            header_info_number = "."
 5132                            header_info_description = (
 5133                                f"Exomiser {header_column} annotation"
 5134                            )
 5135                            header_info_source = "Exomiser"
 5136                            header_info_version = "unknown"
 5137                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5138                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5139                                header_info_name,
 5140                                header_info_number,
 5141                                header_info_type,
 5142                                header_info_description,
 5143                                header_info_source,
 5144                                header_info_version,
 5145                                header_info_code,
 5146                            )
 5147
 5148                            # Add field to add for update to concat fields
 5149                            sql_query_update_concat_fields.append(
 5150                                f"""
 5151                                CASE
 5152                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5153                                    THEN concat(
 5154                                        '{header_info_name}=',
 5155                                        table_parquet."{header_column}",
 5156                                        ';'
 5157                                        )
 5158
 5159                                    ELSE ''
 5160                                END
 5161                            """
 5162                            )
 5163
 5164                    # Update query
 5165                    sql_query_update = f"""
 5166                        UPDATE {table_variants} as table_variants
 5167                            SET INFO = concat(
 5168                                            CASE
 5169                                                WHEN INFO NOT IN ('', '.')
 5170                                                THEN INFO
 5171                                                ELSE ''
 5172                                            END,
 5173                                            CASE
 5174                                                WHEN table_variants.INFO NOT IN ('','.')
 5175                                                THEN ';'
 5176                                                ELSE ''
 5177                                            END,
 5178                                            (
 5179                                            SELECT 
 5180                                                concat(
 5181                                                    {",".join(sql_query_update_concat_fields)}
 5182                                                )
 5183                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5184                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5185                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5186                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5187                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5188                                            )
 5189                                        )
 5190                            ;
 5191                        """
 5192
 5193                    # Update
 5194                    self.conn.execute(sql_query_update)
 5195
 5196                ### Annotate with VCF INFO field ###
 5197
 5198                # Init result VCF file
 5199                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5200
 5201                # If VCF exists
 5202                if os.path.exists(output_results_vcf):
 5203
 5204                    # Log
 5205                    log.debug("Exomiser result VCF update variants")
 5206
 5207                    # Find Exomiser INFO field annotation in header
 5208                    with gzip.open(output_results_vcf, "rt") as f:
 5209                        header_list = self.read_vcf_header(f)
 5210                    exomiser_vcf_header = vcf.Reader(
 5211                        io.StringIO("\n".join(header_list))
 5212                    )
 5213
 5214                    # Add annotation INFO field to header
 5215                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5216
 5217                    # Update variants with VCF
 5218                    self.update_from_vcf(output_results_vcf)
 5219
 5220        return True
 5221
 5222    def annotation_snpeff(self, threads: int = None) -> None:
 5223        """
 5224        This function annotate with snpEff
 5225
 5226        :param threads: The number of threads to use
 5227        :return: the value of the variable "return_value".
 5228        """
 5229
 5230        # DEBUG
 5231        log.debug("Start annotation with snpeff databases")
 5232
 5233        # Threads
 5234        if not threads:
 5235            threads = self.get_threads()
 5236        log.debug("Threads: " + str(threads))
 5237
 5238        # DEBUG
 5239        delete_tmp = True
 5240        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5241            delete_tmp = False
 5242            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5243
 5244        # Config
 5245        config = self.get_config()
 5246        log.debug("Config: " + str(config))
 5247
 5248        # Config - Folders - Databases
 5249        databases_folders = (
 5250            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5251        )
 5252        log.debug("Databases annotations: " + str(databases_folders))
 5253
 5254        # Config - snpEff bin command
 5255        snpeff_bin_command = get_bin_command(
 5256            bin="snpEff.jar",
 5257            tool="snpeff",
 5258            bin_type="jar",
 5259            config=config,
 5260            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5261        )
 5262        if not snpeff_bin_command:
 5263            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5264            log.error(msg_err)
 5265            raise ValueError(msg_err)
 5266
 5267        # Config - snpEff databases
 5268        snpeff_databases = (
 5269            config.get("folders", {})
 5270            .get("databases", {})
 5271            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5272        )
 5273        snpeff_databases = full_path(snpeff_databases)
 5274        if snpeff_databases is not None and snpeff_databases != "":
 5275            log.debug(f"Create snpEff databases folder")
 5276            if not os.path.exists(snpeff_databases):
 5277                os.makedirs(snpeff_databases)
 5278
 5279        # Param
 5280        param = self.get_param()
 5281        log.debug("Param: " + str(param))
 5282
 5283        # Param
 5284        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5285        log.debug("Options: " + str(options))
 5286
 5287        # Param - Assembly
 5288        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5289
 5290        # Param - Options
 5291        snpeff_options = (
 5292            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5293        )
 5294        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5295        snpeff_csvstats = (
 5296            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5297        )
 5298        if snpeff_stats:
 5299            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5300            snpeff_stats = full_path(snpeff_stats)
 5301            snpeff_options += f" -stats {snpeff_stats}"
 5302        if snpeff_csvstats:
 5303            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5304            snpeff_csvstats = full_path(snpeff_csvstats)
 5305            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5306
 5307        # Data
 5308        table_variants = self.get_table_variants()
 5309
 5310        # Check if not empty
 5311        log.debug("Check if not empty")
 5312        sql_query_chromosomes = (
 5313            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5314        )
 5315        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5316        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5317            log.info(f"VCF empty")
 5318            return
 5319
 5320        # Export in VCF
 5321        log.debug("Create initial file to annotate")
 5322        tmp_vcf = NamedTemporaryFile(
 5323            prefix=self.get_prefix(),
 5324            dir=self.get_tmp_dir(),
 5325            suffix=".vcf.gz",
 5326            delete=True,
 5327        )
 5328        tmp_vcf_name = tmp_vcf.name
 5329
 5330        # VCF header
 5331        vcf_reader = self.get_header()
 5332        log.debug("Initial header: " + str(vcf_reader.infos))
 5333
 5334        # Existing annotations
 5335        for vcf_annotation in self.get_header().infos:
 5336
 5337            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5338            log.debug(
 5339                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5340            )
 5341
 5342        # Memory limit
 5343        # if config.get("memory", None):
 5344        #     memory_limit = config.get("memory", "8G")
 5345        # else:
 5346        #     memory_limit = "8G"
 5347        memory_limit = self.get_memory("8G")
 5348        log.debug(f"memory_limit: {memory_limit}")
 5349
 5350        # snpEff java options
 5351        snpeff_java_options = (
 5352            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5353        )
 5354        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5355
 5356        force_update_annotation = True
 5357
 5358        if "ANN" not in self.get_header().infos or force_update_annotation:
 5359
 5360            # Check snpEff database
 5361            log.debug(f"Check snpEff databases {[assembly]}")
 5362            databases_download_snpeff(
 5363                folder=snpeff_databases, assemblies=[assembly], config=config
 5364            )
 5365
 5366            # Export VCF file
 5367            self.export_variant_vcf(
 5368                vcf_file=tmp_vcf_name,
 5369                remove_info=True,
 5370                add_samples=False,
 5371                index=True,
 5372            )
 5373
 5374            # Tmp file
 5375            err_files = []
 5376            tmp_annotate_vcf = NamedTemporaryFile(
 5377                prefix=self.get_prefix(),
 5378                dir=self.get_tmp_dir(),
 5379                suffix=".vcf",
 5380                delete=False,
 5381            )
 5382            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5383            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5384            err_files.append(tmp_annotate_vcf_name_err)
 5385
 5386            # Command
 5387            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5388            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5389            run_parallel_commands([snpeff_command], 1)
 5390
 5391            # Error messages
 5392            log.info(f"Error/Warning messages:")
 5393            error_message_command_all = []
 5394            error_message_command_warning = []
 5395            error_message_command_err = []
 5396            for err_file in err_files:
 5397                with open(err_file, "r") as f:
 5398                    for line in f:
 5399                        message = line.strip()
 5400                        error_message_command_all.append(message)
 5401                        if line.startswith("[W::"):
 5402                            error_message_command_warning.append(message)
 5403                        if line.startswith("[E::"):
 5404                            error_message_command_err.append(f"{err_file}: " + message)
 5405            # log info
 5406            for message in list(
 5407                set(error_message_command_err + error_message_command_warning)
 5408            ):
 5409                log.info(f"   {message}")
 5410            # debug info
 5411            for message in list(set(error_message_command_all)):
 5412                log.debug(f"   {message}")
 5413            # failed
 5414            if len(error_message_command_err):
 5415                log.error("Annotation failed: Error in commands")
 5416                raise ValueError("Annotation failed: Error in commands")
 5417
 5418            # Find annotation in header
 5419            with open(tmp_annotate_vcf_name, "rt") as f:
 5420                header_list = self.read_vcf_header(f)
 5421            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5422
 5423            for ann in annovar_vcf_header.infos:
 5424                if ann not in self.get_header().infos:
 5425                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5426
 5427            # Update variants
 5428            log.info(f"Annotation - Updating...")
 5429            self.update_from_vcf(tmp_annotate_vcf_name)
 5430
 5431        else:
 5432            if "ANN" in self.get_header().infos:
 5433                log.debug(f"Existing snpEff annotations in VCF")
 5434            if force_update_annotation:
 5435                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5436
 5437    def annotation_annovar(self, threads: int = None) -> None:
 5438        """
 5439        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5440        annotations
 5441
 5442        :param threads: number of threads to use
 5443        :return: the value of the variable "return_value".
 5444        """
 5445
 5446        # DEBUG
 5447        log.debug("Start annotation with Annovar databases")
 5448
 5449        # Threads
 5450        if not threads:
 5451            threads = self.get_threads()
 5452        log.debug("Threads: " + str(threads))
 5453
 5454        # Tmp en Err files
 5455        tmp_files = []
 5456        err_files = []
 5457
 5458        # DEBUG
 5459        delete_tmp = True
 5460        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5461            delete_tmp = False
 5462            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5463
 5464        # Config
 5465        config = self.get_config()
 5466        log.debug("Config: " + str(config))
 5467
 5468        # Config - Folders - Databases
 5469        databases_folders = (
 5470            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5471        )
 5472        log.debug("Databases annotations: " + str(databases_folders))
 5473
 5474        # Config - annovar bin command
 5475        annovar_bin_command = get_bin_command(
 5476            bin="table_annovar.pl",
 5477            tool="annovar",
 5478            bin_type="perl",
 5479            config=config,
 5480            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5481        )
 5482        if not annovar_bin_command:
 5483            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5484            log.error(msg_err)
 5485            raise ValueError(msg_err)
 5486
 5487        # Config - BCFTools bin command
 5488        bcftools_bin_command = get_bin_command(
 5489            bin="bcftools",
 5490            tool="bcftools",
 5491            bin_type="bin",
 5492            config=config,
 5493            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5494        )
 5495        if not bcftools_bin_command:
 5496            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5497            log.error(msg_err)
 5498            raise ValueError(msg_err)
 5499
 5500        # Config - annovar databases
 5501        annovar_databases = (
 5502            config.get("folders", {})
 5503            .get("databases", {})
 5504            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5505        )
 5506        if annovar_databases is not None:
 5507            if isinstance(annovar_databases, list):
 5508                annovar_databases = full_path(annovar_databases[0])
 5509                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
 5510            annovar_databases = full_path(annovar_databases)
 5511            if not os.path.exists(annovar_databases):
 5512                log.info(f"Annovar databases folder '{annovar_databases}' created")
 5513                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
 5514        else:
 5515            msg_err = f"Annovar databases configuration failed"
 5516            log.error(msg_err)
 5517            raise ValueError(msg_err)
 5518
 5519        # Param
 5520        param = self.get_param()
 5521        log.debug("Param: " + str(param))
 5522
 5523        # Param - options
 5524        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5525        log.debug("Options: " + str(options))
 5526
 5527        # Param - annotations
 5528        annotations = (
 5529            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5530        )
 5531        log.debug("Annotations: " + str(annotations))
 5532
 5533        # Param - Assembly
 5534        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5535
 5536        # Annovar database assembly
 5537        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5538        if annovar_databases_assembly != "" and not os.path.exists(
 5539            annovar_databases_assembly
 5540        ):
 5541            os.makedirs(annovar_databases_assembly)
 5542
 5543        # Data
 5544        table_variants = self.get_table_variants()
 5545
 5546        # Check if not empty
 5547        log.debug("Check if not empty")
 5548        sql_query_chromosomes = (
 5549            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5550        )
 5551        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5552        if not sql_query_chromosomes_df["count"][0]:
 5553            log.info(f"VCF empty")
 5554            return
 5555
 5556        # VCF header
 5557        vcf_reader = self.get_header()
 5558        log.debug("Initial header: " + str(vcf_reader.infos))
 5559
 5560        # Existing annotations
 5561        for vcf_annotation in self.get_header().infos:
 5562
 5563            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5564            log.debug(
 5565                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5566            )
 5567
 5568        force_update_annotation = True
 5569
 5570        if annotations:
 5571
 5572            commands = []
 5573            tmp_annotates_vcf_name_list = []
 5574
 5575            # Export in VCF
 5576            log.debug("Create initial file to annotate")
 5577            tmp_vcf = NamedTemporaryFile(
 5578                prefix=self.get_prefix(),
 5579                dir=self.get_tmp_dir(),
 5580                suffix=".vcf.gz",
 5581                delete=False,
 5582            )
 5583            tmp_vcf_name = tmp_vcf.name
 5584            tmp_files.append(tmp_vcf_name)
 5585            tmp_files.append(tmp_vcf_name + ".tbi")
 5586
 5587            # Export VCF file
 5588            self.export_variant_vcf(
 5589                vcf_file=tmp_vcf_name,
 5590                remove_info=".",
 5591                add_samples=False,
 5592                index=True,
 5593            )
 5594
 5595            # Create file for field rename
 5596            log.debug("Create file for field rename")
 5597            tmp_rename = NamedTemporaryFile(
 5598                prefix=self.get_prefix(),
 5599                dir=self.get_tmp_dir(),
 5600                suffix=".rename",
 5601                delete=False,
 5602            )
 5603            tmp_rename_name = tmp_rename.name
 5604            tmp_files.append(tmp_rename_name)
 5605
 5606            # Check Annovar database
 5607            log.debug(
 5608                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5609            )
 5610            databases_download_annovar(
 5611                folder=annovar_databases,
 5612                files=list(annotations.keys()),
 5613                assemblies=[assembly],
 5614            )
 5615
 5616            for annotation in annotations:
 5617                annotation_fields = annotations[annotation]
 5618
 5619                if not annotation_fields:
 5620                    annotation_fields = {"INFO": None}
 5621
 5622                log.info(f"Annotations Annovar - database '{annotation}'")
 5623                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5624
 5625                # Tmp file for annovar
 5626                err_files = []
 5627                tmp_annotate_vcf_directory = TemporaryDirectory(
 5628                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5629                )
 5630                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5631                tmp_annotate_vcf_name_annovar = (
 5632                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5633                )
 5634                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5635                err_files.append(tmp_annotate_vcf_name_err)
 5636                tmp_files.append(tmp_annotate_vcf_name_err)
 5637
 5638                # Tmp file final vcf annotated by annovar
 5639                tmp_annotate_vcf = NamedTemporaryFile(
 5640                    prefix=self.get_prefix(),
 5641                    dir=self.get_tmp_dir(),
 5642                    suffix=".vcf.gz",
 5643                    delete=False,
 5644                )
 5645                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5646                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5647                tmp_files.append(tmp_annotate_vcf_name)
 5648                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5649
 5650                # Number of fields
 5651                annotation_list = []
 5652                annotation_renamed_list = []
 5653
 5654                for annotation_field in annotation_fields:
 5655
 5656                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5657                    annotation_fields_new_name = annotation_fields.get(
 5658                        annotation_field, annotation_field
 5659                    )
 5660                    if not annotation_fields_new_name:
 5661                        annotation_fields_new_name = annotation_field
 5662
 5663                    if (
 5664                        force_update_annotation
 5665                        or annotation_fields_new_name not in self.get_header().infos
 5666                    ):
 5667                        annotation_list.append(annotation_field)
 5668                        annotation_renamed_list.append(annotation_fields_new_name)
 5669                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5670                        log.warning(
 5671                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5672                        )
 5673
 5674                    # Add rename info
 5675                    run_parallel_commands(
 5676                        [
 5677                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5678                        ],
 5679                        1,
 5680                    )
 5681
 5682                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5683                log.debug("annotation_list: " + str(annotation_list))
 5684
 5685                # protocol
 5686                protocol = annotation
 5687
 5688                # argument
 5689                argument = ""
 5690
 5691                # operation
 5692                operation = "f"
 5693                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5694                    "ensGene"
 5695                ):
 5696                    operation = "g"
 5697                    if options.get("genebase", None):
 5698                        argument = f"""'{options.get("genebase","")}'"""
 5699                elif annotation in ["cytoBand"]:
 5700                    operation = "r"
 5701
 5702                # argument option
 5703                argument_option = ""
 5704                if argument != "":
 5705                    argument_option = " --argument " + argument
 5706
 5707                # command options
 5708                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5709                for option in options:
 5710                    if option not in ["genebase"]:
 5711                        command_options += f""" --{option}={options[option]}"""
 5712
 5713                # Command
 5714
 5715                # Command - Annovar
 5716                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5717                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5718
 5719                # Command - start pipe
 5720                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5721
 5722                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5723                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5724
 5725                # Command - Special characters (refGene annotation)
 5726                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5727
 5728                # Command - Clean empty fields (with value ".")
 5729                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5730
 5731                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5732                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5733                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5734                    # for ann in annotation_renamed_list:
 5735                    for ann in annotation_list:
 5736                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5737
 5738                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5739
 5740                # Command - indexing
 5741                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5742
 5743                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5744                run_parallel_commands([command_annovar], 1)
 5745
 5746                # Error messages
 5747                log.info(f"Error/Warning messages:")
 5748                error_message_command_all = []
 5749                error_message_command_warning = []
 5750                error_message_command_err = []
 5751                for err_file in err_files:
 5752                    with open(err_file, "r") as f:
 5753                        for line in f:
 5754                            message = line.strip()
 5755                            error_message_command_all.append(message)
 5756                            if line.startswith("[W::") or line.startswith("WARNING"):
 5757                                error_message_command_warning.append(message)
 5758                            if line.startswith("[E::") or line.startswith("ERROR"):
 5759                                error_message_command_err.append(
 5760                                    f"{err_file}: " + message
 5761                                )
 5762                # log info
 5763                for message in list(
 5764                    set(error_message_command_err + error_message_command_warning)
 5765                ):
 5766                    log.info(f"   {message}")
 5767                # debug info
 5768                for message in list(set(error_message_command_all)):
 5769                    log.debug(f"   {message}")
 5770                # failed
 5771                if len(error_message_command_err):
 5772                    log.error("Annotation failed: Error in commands")
 5773                    raise ValueError("Annotation failed: Error in commands")
 5774
 5775            if tmp_annotates_vcf_name_list:
 5776
 5777                # List of annotated files
 5778                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5779
 5780                # Tmp file
 5781                tmp_annotate_vcf = NamedTemporaryFile(
 5782                    prefix=self.get_prefix(),
 5783                    dir=self.get_tmp_dir(),
 5784                    suffix=".vcf.gz",
 5785                    delete=False,
 5786                )
 5787                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5788                tmp_files.append(tmp_annotate_vcf_name)
 5789                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5790                err_files.append(tmp_annotate_vcf_name_err)
 5791                tmp_files.append(tmp_annotate_vcf_name_err)
 5792
 5793                # Command merge
 5794                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5795                log.info(
 5796                    f"Annotation Annovar - Annotation merging "
 5797                    + str(len(tmp_annotates_vcf_name_list))
 5798                    + " annotated files"
 5799                )
 5800                log.debug(f"Annotation - merge command: {merge_command}")
 5801                run_parallel_commands([merge_command], 1)
 5802
 5803                # Find annotation in header
 5804                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5805                    header_list = self.read_vcf_header(f)
 5806                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5807
 5808                for ann in annovar_vcf_header.infos:
 5809                    if ann not in self.get_header().infos:
 5810                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5811
 5812                # Update variants
 5813                log.info(f"Annotation Annovar - Updating...")
 5814                self.update_from_vcf(tmp_annotate_vcf_name)
 5815
 5816            # Clean files
 5817            # Tmp file remove command
 5818            if True:
 5819                tmp_files_remove_command = ""
 5820                if tmp_files:
 5821                    tmp_files_remove_command = " ".join(tmp_files)
 5822                clean_command = f" rm -f {tmp_files_remove_command} "
 5823                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5824                log.debug(f"Annotation - cleaning command: {clean_command}")
 5825                run_parallel_commands([clean_command], 1)
 5826
 5827    # Parquet
 5828    def annotation_parquet(self, threads: int = None) -> None:
 5829        """
 5830        It takes a VCF file, and annotates it with a parquet file
 5831
 5832        :param threads: number of threads to use for the annotation
 5833        :return: the value of the variable "result".
 5834        """
 5835
 5836        # DEBUG
 5837        log.debug("Start annotation with parquet databases")
 5838
 5839        # Threads
 5840        if not threads:
 5841            threads = self.get_threads()
 5842        log.debug("Threads: " + str(threads))
 5843
 5844        # DEBUG
 5845        delete_tmp = True
 5846        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5847            delete_tmp = False
 5848            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5849
 5850        # Config
 5851        databases_folders = set(
 5852            self.get_config()
 5853            .get("folders", {})
 5854            .get("databases", {})
 5855            .get("annotations", ["."])
 5856            + self.get_config()
 5857            .get("folders", {})
 5858            .get("databases", {})
 5859            .get("parquet", ["."])
 5860        )
 5861        log.debug("Databases annotations: " + str(databases_folders))
 5862
 5863        # Param
 5864        annotations = (
 5865            self.get_param()
 5866            .get("annotation", {})
 5867            .get("parquet", {})
 5868            .get("annotations", None)
 5869        )
 5870        log.debug("Annotations: " + str(annotations))
 5871
 5872        # Assembly
 5873        assembly = self.get_param().get(
 5874            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5875        )
 5876
 5877        # Force Update Annotation
 5878        force_update_annotation = (
 5879            self.get_param()
 5880            .get("annotation", {})
 5881            .get("options", {})
 5882            .get("annotations_update", False)
 5883        )
 5884        log.debug(f"force_update_annotation={force_update_annotation}")
 5885        force_append_annotation = (
 5886            self.get_param()
 5887            .get("annotation", {})
 5888            .get("options", {})
 5889            .get("annotations_append", False)
 5890        )
 5891        log.debug(f"force_append_annotation={force_append_annotation}")
 5892
 5893        # Data
 5894        table_variants = self.get_table_variants()
 5895
 5896        # Check if not empty
 5897        log.debug("Check if not empty")
 5898        sql_query_chromosomes_df = self.get_query_to_df(
 5899            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5900        )
 5901        if not sql_query_chromosomes_df["count"][0]:
 5902            log.info(f"VCF empty")
 5903            return
 5904
 5905        # VCF header
 5906        vcf_reader = self.get_header()
 5907        log.debug("Initial header: " + str(vcf_reader.infos))
 5908
 5909        # Nb Variants POS
 5910        log.debug("NB Variants Start")
 5911        nb_variants = self.conn.execute(
 5912            f"SELECT count(*) AS count FROM variants"
 5913        ).fetchdf()["count"][0]
 5914        log.debug("NB Variants Stop")
 5915
 5916        # Existing annotations
 5917        for vcf_annotation in self.get_header().infos:
 5918
 5919            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5920            log.debug(
 5921                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5922            )
 5923
 5924        # Added columns
 5925        added_columns = []
 5926
 5927        # drop indexes
 5928        log.debug(f"Drop indexes...")
 5929        self.drop_indexes()
 5930
 5931        if annotations:
 5932
 5933            if "ALL" in annotations:
 5934
 5935                all_param = annotations.get("ALL", {})
 5936                all_param_formats = all_param.get("formats", None)
 5937                all_param_releases = all_param.get("releases", None)
 5938
 5939                databases_infos_dict = self.scan_databases(
 5940                    database_formats=all_param_formats,
 5941                    database_releases=all_param_releases,
 5942                )
 5943                for database_infos in databases_infos_dict.keys():
 5944                    if database_infos not in annotations:
 5945                        annotations[database_infos] = {"INFO": None}
 5946
 5947            for annotation in annotations:
 5948
 5949                if annotation in ["ALL"]:
 5950                    continue
 5951
 5952                # Annotation Name
 5953                annotation_name = os.path.basename(annotation)
 5954
 5955                # Annotation fields
 5956                annotation_fields = annotations[annotation]
 5957                if not annotation_fields:
 5958                    annotation_fields = {"INFO": None}
 5959
 5960                log.debug(f"Annotation '{annotation_name}'")
 5961                log.debug(
 5962                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5963                )
 5964
 5965                # Create Database
 5966                database = Database(
 5967                    database=annotation,
 5968                    databases_folders=databases_folders,
 5969                    assembly=assembly,
 5970                )
 5971
 5972                # Find files
 5973                parquet_file = database.get_database()
 5974                parquet_hdr_file = database.get_header_file()
 5975                parquet_type = database.get_type()
 5976
 5977                # Check if files exists
 5978                if not parquet_file or not parquet_hdr_file:
 5979                    msg_err_list = []
 5980                    if not parquet_file:
 5981                        msg_err_list.append(
 5982                            f"Annotation failed: Annotation file not found"
 5983                        )
 5984                    if parquet_file and not parquet_hdr_file:
 5985                        msg_err_list.append(
 5986                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
 5987                        )
 5988
 5989                    log.error(". ".join(msg_err_list))
 5990                    raise ValueError(". ".join(msg_err_list))
 5991                else:
 5992                    # Get parquet connexion
 5993                    parquet_sql_attach = database.get_sql_database_attach(
 5994                        output="query"
 5995                    )
 5996                    if parquet_sql_attach:
 5997                        self.conn.execute(parquet_sql_attach)
 5998                    parquet_file_link = database.get_sql_database_link()
 5999                    # Log
 6000                    log.debug(
 6001                        f"Annotation '{annotation_name}' - file: "
 6002                        + str(parquet_file)
 6003                        + " and "
 6004                        + str(parquet_hdr_file)
 6005                    )
 6006
 6007                    # Database full header columns
 6008                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 6009                        parquet_hdr_file
 6010                    )
 6011                    # Log
 6012                    log.debug(
 6013                        "Annotation database header columns : "
 6014                        + str(parquet_hdr_vcf_header_columns)
 6015                    )
 6016
 6017                    # Load header as VCF object
 6018                    parquet_hdr_vcf_header_infos = database.get_header().infos
 6019                    # Log
 6020                    log.debug(
 6021                        "Annotation database header: "
 6022                        + str(parquet_hdr_vcf_header_infos)
 6023                    )
 6024
 6025                    # Get extra infos
 6026                    parquet_columns = database.get_extra_columns()
 6027                    # Log
 6028                    log.debug("Annotation database Columns: " + str(parquet_columns))
 6029
 6030                    # Add extra columns if "ALL" in annotation_fields
 6031                    # if "ALL" in annotation_fields:
 6032                    #     allow_add_extra_column = True
 6033                    if "ALL" in annotation_fields and database.get_extra_columns():
 6034                        for extra_column in database.get_extra_columns():
 6035                            if (
 6036                                extra_column not in annotation_fields
 6037                                and extra_column.replace("INFO/", "")
 6038                                not in parquet_hdr_vcf_header_infos
 6039                            ):
 6040                                parquet_hdr_vcf_header_infos[extra_column] = (
 6041                                    vcf.parser._Info(
 6042                                        extra_column,
 6043                                        ".",
 6044                                        "String",
 6045                                        f"{extra_column} description",
 6046                                        "unknown",
 6047                                        "unknown",
 6048                                        self.code_type_map["String"],
 6049                                    )
 6050                                )
 6051
 6052                    # For all fields in database
 6053                    annotation_fields_all = False
 6054                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 6055                        annotation_fields_all = True
 6056                        annotation_fields = {
 6057                            key: key for key in parquet_hdr_vcf_header_infos
 6058                        }
 6059
 6060                        log.debug(
 6061                            "Annotation database header - All annotations added: "
 6062                            + str(annotation_fields)
 6063                        )
 6064
 6065                    # Init
 6066
 6067                    # List of annotation fields to use
 6068                    sql_query_annotation_update_info_sets = []
 6069
 6070                    # List of annotation to agregate
 6071                    sql_query_annotation_to_agregate = []
 6072
 6073                    # Number of fields
 6074                    nb_annotation_field = 0
 6075
 6076                    # Annotation fields processed
 6077                    annotation_fields_processed = []
 6078
 6079                    # Columns mapping
 6080                    map_columns = database.map_columns(
 6081                        columns=annotation_fields, prefixes=["INFO/"]
 6082                    )
 6083
 6084                    # Query dict for fields to remove (update option)
 6085                    query_dict_remove = {}
 6086
 6087                    # Fetch Anotation fields
 6088                    for annotation_field in annotation_fields:
 6089
 6090                        # annotation_field_column
 6091                        annotation_field_column = map_columns.get(
 6092                            annotation_field, "INFO"
 6093                        )
 6094
 6095                        # field new name, if parametered
 6096                        annotation_fields_new_name = annotation_fields.get(
 6097                            annotation_field, annotation_field
 6098                        )
 6099                        if not annotation_fields_new_name:
 6100                            annotation_fields_new_name = annotation_field
 6101
 6102                        # To annotate
 6103                        # force_update_annotation = True
 6104                        # force_append_annotation = True
 6105                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 6106                        if annotation_field in parquet_hdr_vcf_header_infos and (
 6107                            force_update_annotation
 6108                            or force_append_annotation
 6109                            or (
 6110                                annotation_fields_new_name
 6111                                not in self.get_header().infos
 6112                            )
 6113                        ):
 6114
 6115                            # Add field to annotation to process list
 6116                            annotation_fields_processed.append(
 6117                                annotation_fields_new_name
 6118                            )
 6119
 6120                            # explode infos for the field
 6121                            annotation_fields_new_name_info_msg = ""
 6122                            if (
 6123                                force_update_annotation
 6124                                and annotation_fields_new_name
 6125                                in self.get_header().infos
 6126                            ):
 6127                                # Remove field from INFO
 6128                                query = f"""
 6129                                    UPDATE {table_variants} as table_variants
 6130                                    SET INFO = REGEXP_REPLACE(
 6131                                                concat(table_variants.INFO,''),
 6132                                                ';*{annotation_fields_new_name}=[^;]*',
 6133                                                ''
 6134                                                )
 6135                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 6136                                """
 6137                                annotation_fields_new_name_info_msg = " [update]"
 6138                                query_dict_remove[
 6139                                    f"remove 'INFO/{annotation_fields_new_name}'"
 6140                                ] = query
 6141
 6142                            # Sep between fields in INFO
 6143                            nb_annotation_field += 1
 6144                            if nb_annotation_field > 1:
 6145                                annotation_field_sep = ";"
 6146                            else:
 6147                                annotation_field_sep = ""
 6148
 6149                            log.info(
 6150                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 6151                            )
 6152
 6153                            # Add INFO field to header
 6154                            parquet_hdr_vcf_header_infos_number = (
 6155                                parquet_hdr_vcf_header_infos[annotation_field].num
 6156                                or "."
 6157                            )
 6158                            parquet_hdr_vcf_header_infos_type = (
 6159                                parquet_hdr_vcf_header_infos[annotation_field].type
 6160                                or "String"
 6161                            )
 6162                            parquet_hdr_vcf_header_infos_description = (
 6163                                parquet_hdr_vcf_header_infos[annotation_field].desc
 6164                                or f"{annotation_field} description"
 6165                            )
 6166                            parquet_hdr_vcf_header_infos_source = (
 6167                                parquet_hdr_vcf_header_infos[annotation_field].source
 6168                                or "unknown"
 6169                            )
 6170                            parquet_hdr_vcf_header_infos_version = (
 6171                                parquet_hdr_vcf_header_infos[annotation_field].version
 6172                                or "unknown"
 6173                            )
 6174
 6175                            vcf_reader.infos[annotation_fields_new_name] = (
 6176                                vcf.parser._Info(
 6177                                    annotation_fields_new_name,
 6178                                    parquet_hdr_vcf_header_infos_number,
 6179                                    parquet_hdr_vcf_header_infos_type,
 6180                                    parquet_hdr_vcf_header_infos_description,
 6181                                    parquet_hdr_vcf_header_infos_source,
 6182                                    parquet_hdr_vcf_header_infos_version,
 6183                                    self.code_type_map[
 6184                                        parquet_hdr_vcf_header_infos_type
 6185                                    ],
 6186                                )
 6187                            )
 6188
 6189                            # Append
 6190                            if force_append_annotation:
 6191                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 6192                            else:
 6193                                query_case_when_append = ""
 6194
 6195                            # Annotation/Update query fields
 6196                            # Found in INFO column
 6197                            if (
 6198                                annotation_field_column == "INFO"
 6199                                and "INFO" in parquet_hdr_vcf_header_columns
 6200                            ):
 6201                                sql_query_annotation_update_info_sets.append(
 6202                                    f"""
 6203                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 6204                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 6205                                        ELSE ''
 6206                                    END
 6207                                """
 6208                                )
 6209                            # Found in a specific column
 6210                            else:
 6211                                sql_query_annotation_update_info_sets.append(
 6212                                    f"""
 6213                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 6214                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 6215                                        ELSE ''
 6216                                    END
 6217                                """
 6218                                )
 6219                                sql_query_annotation_to_agregate.append(
 6220                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6221                                )
 6222
 6223                        # Not to annotate
 6224                        else:
 6225
 6226                            if force_update_annotation:
 6227                                annotation_message = "forced"
 6228                            else:
 6229                                annotation_message = "skipped"
 6230
 6231                            if annotation_field not in parquet_hdr_vcf_header_infos:
 6232                                log.warning(
 6233                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 6234                                )
 6235                            if annotation_fields_new_name in self.get_header().infos:
 6236                                log.warning(
 6237                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 6238                                )
 6239
 6240                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 6241                    # allow_annotation_full_info = True
 6242                    allow_annotation_full_info = not force_append_annotation
 6243
 6244                    if parquet_type in ["regions"]:
 6245                        allow_annotation_full_info = False
 6246
 6247                    if (
 6248                        allow_annotation_full_info
 6249                        and nb_annotation_field == len(annotation_fields)
 6250                        and annotation_fields_all
 6251                        and (
 6252                            "INFO" in parquet_hdr_vcf_header_columns
 6253                            and "INFO" in database.get_extra_columns()
 6254                        )
 6255                    ):
 6256                        log.debug("Column INFO annotation enabled")
 6257                        sql_query_annotation_update_info_sets = []
 6258                        sql_query_annotation_update_info_sets.append(
 6259                            f" table_parquet.INFO "
 6260                        )
 6261
 6262                    if sql_query_annotation_update_info_sets:
 6263
 6264                        # Annotate
 6265                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 6266
 6267                        # Join query annotation update info sets for SQL
 6268                        sql_query_annotation_update_info_sets_sql = ",".join(
 6269                            sql_query_annotation_update_info_sets
 6270                        )
 6271
 6272                        # Check chromosomes list (and variants infos)
 6273                        sql_query_chromosomes = f"""
 6274                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 6275                            FROM {table_variants} as table_variants
 6276                            GROUP BY table_variants."#CHROM"
 6277                            ORDER BY table_variants."#CHROM"
 6278                            """
 6279                        sql_query_chromosomes_df = self.conn.execute(
 6280                            sql_query_chromosomes
 6281                        ).df()
 6282                        sql_query_chromosomes_dict = {
 6283                            entry["CHROM"]: {
 6284                                "count": entry["count_variants"],
 6285                                "min": entry["min_variants"],
 6286                                "max": entry["max_variants"],
 6287                            }
 6288                            for index, entry in sql_query_chromosomes_df.iterrows()
 6289                        }
 6290
 6291                        # Init
 6292                        nb_of_query = 0
 6293                        nb_of_variant_annotated = 0
 6294                        query_dict = query_dict_remove
 6295
 6296                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 6297                        for chrom in sql_query_chromosomes_dict:
 6298
 6299                            # Number of variant by chromosome
 6300                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 6301                                chrom, {}
 6302                            ).get("count", 0)
 6303
 6304                            log.debug(
 6305                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 6306                            )
 6307
 6308                            # Annotation with regions database
 6309                            if parquet_type in ["regions"]:
 6310                                sql_query_annotation_from_clause = f"""
 6311                                    FROM (
 6312                                        SELECT 
 6313                                            '{chrom}' AS \"#CHROM\",
 6314                                            table_variants_from.\"POS\" AS \"POS\",
 6315                                            {",".join(sql_query_annotation_to_agregate)}
 6316                                        FROM {table_variants} as table_variants_from
 6317                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 6318                                            table_parquet_from."#CHROM" = '{chrom}'
 6319                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 6320                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 6321                                        )
 6322                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 6323                                        GROUP BY table_variants_from.\"POS\"
 6324                                        )
 6325                                        as table_parquet
 6326                                """
 6327
 6328                                sql_query_annotation_where_clause = """
 6329                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 6330                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6331                                """
 6332
 6333                            # Annotation with variants database
 6334                            else:
 6335                                sql_query_annotation_from_clause = f"""
 6336                                    FROM {parquet_file_link} as table_parquet
 6337                                """
 6338                                sql_query_annotation_where_clause = f"""
 6339                                    table_variants."#CHROM" = '{chrom}'
 6340                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 6341                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6342                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 6343                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 6344                                """
 6345
 6346                            # Create update query
 6347                            sql_query_annotation_chrom_interval_pos = f"""
 6348                                UPDATE {table_variants} as table_variants
 6349                                    SET INFO = 
 6350                                        concat(
 6351                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6352                                                THEN table_variants.INFO
 6353                                                ELSE ''
 6354                                            END
 6355                                            ,
 6356                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6357                                                        AND (
 6358                                                        concat({sql_query_annotation_update_info_sets_sql})
 6359                                                        )
 6360                                                        NOT IN ('','.') 
 6361                                                    THEN ';'
 6362                                                    ELSE ''
 6363                                            END
 6364                                            ,
 6365                                            {sql_query_annotation_update_info_sets_sql}
 6366                                            )
 6367                                    {sql_query_annotation_from_clause}
 6368                                    WHERE {sql_query_annotation_where_clause}
 6369                                    ;
 6370                                """
 6371
 6372                            # Add update query to dict
 6373                            query_dict[
 6374                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6375                            ] = sql_query_annotation_chrom_interval_pos
 6376
 6377                        nb_of_query = len(query_dict)
 6378                        num_query = 0
 6379
 6380                        # SET max_expression_depth TO x
 6381                        self.conn.execute("SET max_expression_depth TO 10000")
 6382
 6383                        for query_name in query_dict:
 6384                            query = query_dict[query_name]
 6385                            num_query += 1
 6386                            log.info(
 6387                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6388                            )
 6389                            result = self.conn.execute(query)
 6390                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6391                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6392                            log.info(
 6393                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6394                            )
 6395
 6396                        log.info(
 6397                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6398                        )
 6399
 6400                    else:
 6401
 6402                        log.info(
 6403                            f"Annotation '{annotation_name}' - No Annotations available"
 6404                        )
 6405
 6406                    log.debug("Final header: " + str(vcf_reader.infos))
 6407
 6408        # Remove added columns
 6409        for added_column in added_columns:
 6410            self.drop_column(column=added_column)
 6411
 6412    def annotation_splice(self, threads: int = None) -> None:
 6413        """
 6414        This function annotate with snpEff
 6415
 6416        :param threads: The number of threads to use
 6417        :return: the value of the variable "return_value".
 6418        """
 6419
 6420        # DEBUG
 6421        log.debug("Start annotation with splice tools")
 6422
 6423        # Threads
 6424        if not threads:
 6425            threads = self.get_threads()
 6426        log.debug("Threads: " + str(threads))
 6427
 6428        # DEBUG
 6429        delete_tmp = True
 6430        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6431            delete_tmp = False
 6432            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6433
 6434        # Config
 6435        config = self.get_config()
 6436        log.debug("Config: " + str(config))
 6437        splice_config = config.get("tools", {}).get("splice", {})
 6438        if not splice_config:
 6439            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6440            msg_err = "No Splice tool config"
 6441            raise ValueError(msg_err)
 6442        log.debug(f"splice_config: {splice_config}")
 6443
 6444        # Config - Folders - Databases
 6445        databases_folders = (
 6446            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6447        )
 6448        log.debug("Databases annotations: " + str(databases_folders))
 6449
 6450        # Splice docker image
 6451        splice_docker_image = splice_config.get("docker").get("image")
 6452
 6453        # Pull splice image if it's not already there
 6454        if not check_docker_image_exists(splice_docker_image):
 6455            log.warning(
 6456                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6457            )
 6458            try:
 6459                command(f"docker pull {splice_config.get('docker').get('image')}")
 6460            except subprocess.CalledProcessError:
 6461                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6462                log.error(msg_err)
 6463                raise ValueError(msg_err)
 6464
 6465        # Config - splice databases
 6466        splice_databases = (
 6467            config.get("folders", {})
 6468            .get("databases", {})
 6469            .get("splice", DEFAULT_SPLICE_FOLDER)
 6470        )
 6471        splice_databases = full_path(splice_databases)
 6472
 6473        # Param
 6474        param = self.get_param()
 6475        log.debug("Param: " + str(param))
 6476
 6477        # Param
 6478        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6479        log.debug("Options: " + str(options))
 6480
 6481        # Data
 6482        table_variants = self.get_table_variants()
 6483
 6484        # Check if not empty
 6485        log.debug("Check if not empty")
 6486        sql_query_chromosomes = (
 6487            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6488        )
 6489        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6490            log.info("VCF empty")
 6491            return None
 6492
 6493        # Export in VCF
 6494        log.debug("Create initial file to annotate")
 6495
 6496        # Create output folder / work folder
 6497        if options.get("output_folder", ""):
 6498            output_folder = options.get("output_folder", "")
 6499            if not os.path.exists(output_folder):
 6500                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6501        else:
 6502            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6503            if not os.path.exists(output_folder):
 6504                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6505
 6506        if options.get("workdir", ""):
 6507            workdir = options.get("workdir", "")
 6508        else:
 6509            workdir = "/work"
 6510
 6511        # Create tmp VCF file
 6512        tmp_vcf = NamedTemporaryFile(
 6513            prefix=self.get_prefix(),
 6514            dir=output_folder,
 6515            suffix=".vcf",
 6516            delete=False,
 6517        )
 6518        tmp_vcf_name = tmp_vcf.name
 6519
 6520        # VCF header
 6521        header = self.get_header()
 6522
 6523        # Existing annotations
 6524        for vcf_annotation in self.get_header().infos:
 6525
 6526            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6527            log.debug(
 6528                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6529            )
 6530
 6531        # Memory limit
 6532        if config.get("memory", None):
 6533            memory_limit = config.get("memory", "8G").upper()
 6534            # upper()
 6535        else:
 6536            memory_limit = "8G"
 6537        log.debug(f"memory_limit: {memory_limit}")
 6538
 6539        # Check number of variants to annotate
 6540        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6541        where_clause_regex_spip = r"SPiP_\w+"
 6542        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6543        df_list_of_variants_to_annotate = self.get_query_to_df(
 6544            query=f""" SELECT * FROM variants {where_clause} """
 6545        )
 6546        if len(df_list_of_variants_to_annotate) == 0:
 6547            log.warning(
 6548                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6549            )
 6550            return None
 6551        else:
 6552            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6553
 6554        # Export VCF file
 6555        self.export_variant_vcf(
 6556            vcf_file=tmp_vcf_name,
 6557            remove_info=True,
 6558            add_samples=True,
 6559            index=False,
 6560            where_clause=where_clause,
 6561        )
 6562        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6563        if any(value for value in splice_config.values() if value is None):
 6564            log.warning("At least one splice config parameter is empty")
 6565            # exit annotation_splice
 6566            return None
 6567
 6568        # Params in splice nf
 6569        def check_values(dico: dict):
 6570            """
 6571            Ensure parameters for NF splice pipeline
 6572            """
 6573            for key, val in dico.items():
 6574                if key == "genome":
 6575                    if any(
 6576                        assemb in options.get("genome", {})
 6577                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6578                    ):
 6579                        yield f"--{key} hg19"
 6580                    elif any(
 6581                        assemb in options.get("genome", {})
 6582                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6583                    ):
 6584                        yield f"--{key} hg38"
 6585                elif (
 6586                    (isinstance(val, str) and val)
 6587                    or isinstance(val, int)
 6588                    or isinstance(val, bool)
 6589                ):
 6590                    yield f"--{key} {val}"
 6591
 6592        # Genome
 6593        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6594        options["genome"] = genome
 6595        # NF params
 6596        nf_params = []
 6597        # Add options
 6598        if options:
 6599            log.debug(options)
 6600            nf_params = list(check_values(options))
 6601            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6602        else:
 6603            log.debug("No NF params provided")
 6604        # Add threads
 6605        if "threads" not in options.keys():
 6606            nf_params.append(f"--threads {threads}")
 6607        # Genome path
 6608        genome_path = find_genome(
 6609            config.get("folders", {})
 6610            .get("databases", {})
 6611            .get("genomes", DEFAULT_GENOME_FOLDER),
 6612            file=f"{genome}.fa",
 6613        )
 6614        # Add genome path
 6615        if not genome_path:
 6616            raise ValueError(
 6617                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6618            )
 6619        else:
 6620            log.debug(f"Genome: {genome_path}")
 6621            nf_params.append(f"--genome_path {genome_path}")
 6622
 6623        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6624            """
 6625            Setting up updated databases for SPiP and SpliceAI
 6626            """
 6627
 6628            try:
 6629
 6630                # SpliceAI assembly transcriptome
 6631                spliceai_assembly = os.path.join(
 6632                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6633                    options.get("genome"),
 6634                    "transcriptome",
 6635                )
 6636                spip_assembly = options.get("genome")
 6637
 6638                spip = find(
 6639                    f"transcriptome_{spip_assembly}.RData",
 6640                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6641                )
 6642                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6643                log.debug(f"SPiP annotations: {spip}")
 6644                log.debug(f"SpliceAI annotations: {spliceai}")
 6645                if spip and spliceai:
 6646                    return [
 6647                        f"--spip_transcriptome {spip}",
 6648                        f"--spliceai_transcriptome {spliceai}",
 6649                    ]
 6650                else:
 6651                    log.warning(
 6652                        "Can't find splice databases in configuration, use annotations file from image"
 6653                    )
 6654            except TypeError:
 6655                log.warning(
 6656                    "Can't find splice databases in configuration, use annotations file from image"
 6657                )
 6658                return []
 6659
 6660        # Add options, check if transcriptome option have already beend provided
 6661        if (
 6662            "spip_transcriptome" not in nf_params
 6663            and "spliceai_transcriptome" not in nf_params
 6664        ):
 6665            splice_reference = splice_annotations(options, config)
 6666            if splice_reference:
 6667                nf_params.extend(splice_reference)
 6668        # nf_params.append(f"--output_folder {output_folder}")
 6669        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6670        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6671        log.debug(cmd)
 6672        splice_config["docker"]["command"] = cmd
 6673
 6674        # Ensure proxy is set
 6675        proxy = [
 6676            f"-e {var}={os.getenv(var)}"
 6677            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6678            if os.getenv(var) is not None
 6679        ]
 6680        docker_cmd = get_bin_command(
 6681            tool="splice",
 6682            bin_type="docker",
 6683            config=config,
 6684            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6685            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6686        )
 6687        # print(docker_cmd)
 6688        # exit()
 6689        # Docker debug
 6690        # if splice_config.get("rm_container"):
 6691        #     rm_container = "--rm"
 6692        # else:
 6693        #     rm_container = ""
 6694        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6695        log.debug(docker_cmd)
 6696        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6697        log.debug(res.stdout)
 6698        if res.stderr:
 6699            log.error(res.stderr)
 6700        res.check_returncode()
 6701        # Update variants
 6702        log.info("Annotation - Updating...")
 6703        # Test find output vcf
 6704        log.debug(
 6705            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6706        )
 6707        output_vcf = []
 6708        # Wrong folder to look in
 6709        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6710            if (
 6711                files
 6712                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6713            ):
 6714                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6715        # log.debug(os.listdir(options.get("output_folder")))
 6716        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6717        if not output_vcf:
 6718            log.debug(
 6719                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6720            )
 6721        else:
 6722            # Get new header from annotated vcf
 6723            log.debug(f"Initial header: {len(header.infos)} fields")
 6724            # Create new header with splice infos
 6725            new_vcf = Variants(input=output_vcf[0])
 6726            new_vcf_header = new_vcf.get_header().infos
 6727            for keys, infos in new_vcf_header.items():
 6728                if keys not in header.infos.keys():
 6729                    header.infos[keys] = infos
 6730            log.debug(f"New header: {len(header.infos)} fields")
 6731            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6732            self.update_from_vcf(output_vcf[0])
 6733
 6734        # Remove file
 6735        remove_if_exists(output_vcf)
 6736
 6737    ###
 6738    # Prioritization
 6739    ###
 6740
 6741    def get_config_default(self, name: str) -> dict:
 6742        """
 6743        The function `get_config_default` returns a dictionary containing default configurations for
 6744        various calculations and prioritizations.
 6745
 6746        :param name: The `get_config_default` function returns a dictionary containing default
 6747        configurations for different calculations and prioritizations. The `name` parameter is used to
 6748        specify which specific configuration to retrieve from the dictionary
 6749        :type name: str
 6750        :return: The function `get_config_default` returns a dictionary containing default configuration
 6751        settings for different calculations and prioritizations. The specific configuration settings are
 6752        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6753        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6754        returned. If there is no match, an empty dictionary is returned.
 6755        """
 6756
 6757        config_default = {
 6758            "calculations": {
 6759                "variant_chr_pos_alt_ref": {
 6760                    "type": "sql",
 6761                    "name": "variant_chr_pos_alt_ref",
 6762                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6763                    "available": False,
 6764                    "output_column_name": "variant_chr_pos_alt_ref",
 6765                    "output_column_type": "String",
 6766                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6767                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6768                    "operation_info": True,
 6769                },
 6770                "VARTYPE": {
 6771                    "type": "sql",
 6772                    "name": "VARTYPE",
 6773                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6774                    "available": True,
 6775                    "table": "variants",
 6776                    "output_column_name": "VARTYPE",
 6777                    "output_column_type": "String",
 6778                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6779                    "operation_query": """
 6780                            CASE
 6781                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6782                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6783                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6784                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6785                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6786                                ELSE 'UNDEFINED'
 6787                            END
 6788                            """,
 6789                    "info_fields": ["SVTYPE"],
 6790                    "operation_info": True,
 6791                },
 6792                "snpeff_hgvs": {
 6793                    "type": "python",
 6794                    "name": "snpeff_hgvs",
 6795                    "description": "HGVS nomenclatures from snpEff annotation",
 6796                    "available": True,
 6797                    "function_name": "calculation_extract_snpeff_hgvs",
 6798                    "function_params": ["snpeff_hgvs", "ANN"],
 6799                },
 6800                "snpeff_ann_explode": {
 6801                    "type": "python",
 6802                    "name": "snpeff_ann_explode",
 6803                    "description": "Explode snpEff annotations with uniquify values",
 6804                    "available": True,
 6805                    "function_name": "calculation_snpeff_ann_explode",
 6806                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6807                },
 6808                "snpeff_ann_explode_uniquify": {
 6809                    "type": "python",
 6810                    "name": "snpeff_ann_explode_uniquify",
 6811                    "description": "Explode snpEff annotations",
 6812                    "available": True,
 6813                    "function_name": "calculation_snpeff_ann_explode",
 6814                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6815                },
 6816                "snpeff_ann_explode_json": {
 6817                    "type": "python",
 6818                    "name": "snpeff_ann_explode_json",
 6819                    "description": "Explode snpEff annotations in JSON format",
 6820                    "available": True,
 6821                    "function_name": "calculation_snpeff_ann_explode",
 6822                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6823                },
 6824                "NOMEN": {
 6825                    "type": "python",
 6826                    "name": "NOMEN",
 6827                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
 6828                    "available": True,
 6829                    "function_name": "calculation_extract_nomen",
 6830                    "function_params": [],
 6831                },
 6832                "FINDBYPIPELINE": {
 6833                    "type": "python",
 6834                    "name": "FINDBYPIPELINE",
 6835                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6836                    "available": True,
 6837                    "function_name": "calculation_find_by_pipeline",
 6838                    "function_params": ["findbypipeline"],
 6839                },
 6840                "FINDBYSAMPLE": {
 6841                    "type": "python",
 6842                    "name": "FINDBYSAMPLE",
 6843                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6844                    "available": True,
 6845                    "function_name": "calculation_find_by_pipeline",
 6846                    "function_params": ["findbysample"],
 6847                },
 6848                "GENOTYPECONCORDANCE": {
 6849                    "type": "python",
 6850                    "name": "GENOTYPECONCORDANCE",
 6851                    "description": "Concordance of genotype for multi caller VCF",
 6852                    "available": True,
 6853                    "function_name": "calculation_genotype_concordance",
 6854                    "function_params": [],
 6855                },
 6856                "BARCODE": {
 6857                    "type": "python",
 6858                    "name": "BARCODE",
 6859                    "description": "BARCODE as VaRank tool",
 6860                    "available": True,
 6861                    "function_name": "calculation_barcode",
 6862                    "function_params": [],
 6863                },
 6864                "BARCODEFAMILY": {
 6865                    "type": "python",
 6866                    "name": "BARCODEFAMILY",
 6867                    "description": "BARCODEFAMILY as VaRank tool",
 6868                    "available": True,
 6869                    "function_name": "calculation_barcode_family",
 6870                    "function_params": ["BCF"],
 6871                },
 6872                "TRIO": {
 6873                    "type": "python",
 6874                    "name": "TRIO",
 6875                    "description": "Inheritance for a trio family",
 6876                    "available": True,
 6877                    "function_name": "calculation_trio",
 6878                    "function_params": [],
 6879                },
 6880                "VAF": {
 6881                    "type": "python",
 6882                    "name": "VAF",
 6883                    "description": "Variant Allele Frequency (VAF) harmonization",
 6884                    "available": True,
 6885                    "function_name": "calculation_vaf_normalization",
 6886                    "function_params": [],
 6887                },
 6888                "VAF_stats": {
 6889                    "type": "python",
 6890                    "name": "VAF_stats",
 6891                    "description": "Variant Allele Frequency (VAF) statistics",
 6892                    "available": True,
 6893                    "function_name": "calculation_genotype_stats",
 6894                    "function_params": ["VAF"],
 6895                },
 6896                "DP_stats": {
 6897                    "type": "python",
 6898                    "name": "DP_stats",
 6899                    "description": "Depth (DP) statistics",
 6900                    "available": True,
 6901                    "function_name": "calculation_genotype_stats",
 6902                    "function_params": ["DP"],
 6903                },
 6904                "variant_id": {
 6905                    "type": "python",
 6906                    "name": "variant_id",
 6907                    "description": "Variant ID generated from variant position and type",
 6908                    "available": True,
 6909                    "function_name": "calculation_variant_id",
 6910                    "function_params": [],
 6911                },
 6912                "transcripts_json": {
 6913                    "type": "python",
 6914                    "name": "transcripts_json",
 6915                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6916                    "available": True,
 6917                    "function_name": "calculation_transcripts_annotation",
 6918                    "function_params": ["transcripts_json", None],
 6919                },
 6920                "transcripts_ann": {
 6921                    "type": "python",
 6922                    "name": "transcripts_ann",
 6923                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6924                    "available": True,
 6925                    "function_name": "calculation_transcripts_annotation",
 6926                    "function_params": [None, "transcripts_ann"],
 6927                },
 6928                "transcripts_annotations": {
 6929                    "type": "python",
 6930                    "name": "transcripts_annotations",
 6931                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6932                    "available": True,
 6933                    "function_name": "calculation_transcripts_annotation",
 6934                    "function_params": [None, None],
 6935                },
 6936                "transcripts_prioritization": {
 6937                    "type": "python",
 6938                    "name": "transcripts_prioritization",
 6939                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6940                    "available": True,
 6941                    "function_name": "calculation_transcripts_prioritization",
 6942                    "function_params": [],
 6943                },
 6944                "transcripts_export": {
 6945                    "type": "python",
 6946                    "name": "transcripts_export",
 6947                    "description": "Export transcripts table/view as a file (using param.json)",
 6948                    "available": True,
 6949                    "function_name": "calculation_transcripts_export",
 6950                    "function_params": [],
 6951                },
 6952            },
 6953            "prioritizations": {
 6954                "default": {
 6955                    "ANN2": [
 6956                        {
 6957                            "type": "contains",
 6958                            "value": "HIGH",
 6959                            "score": 5,
 6960                            "flag": "PASS",
 6961                            "comment": [
 6962                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6963                            ],
 6964                        },
 6965                        {
 6966                            "type": "contains",
 6967                            "value": "MODERATE",
 6968                            "score": 3,
 6969                            "flag": "PASS",
 6970                            "comment": [
 6971                                "A non-disruptive variant that might change protein effectiveness"
 6972                            ],
 6973                        },
 6974                        {
 6975                            "type": "contains",
 6976                            "value": "LOW",
 6977                            "score": 0,
 6978                            "flag": "FILTERED",
 6979                            "comment": [
 6980                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 6981                            ],
 6982                        },
 6983                        {
 6984                            "type": "contains",
 6985                            "value": "MODIFIER",
 6986                            "score": 0,
 6987                            "flag": "FILTERED",
 6988                            "comment": [
 6989                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 6990                            ],
 6991                        },
 6992                    ],
 6993                }
 6994            },
 6995        }
 6996
 6997        return config_default.get(name, None)
 6998
 6999    def get_config_json(
 7000        self, name: str, config_dict: dict = {}, config_file: str = None
 7001    ) -> dict:
 7002        """
 7003        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 7004        default values, a dictionary, and a file.
 7005
 7006        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 7007        the name of the configuration. It is used to identify and retrieve the configuration settings
 7008        for a specific component or module
 7009        :type name: str
 7010        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 7011        dictionary that allows you to provide additional configuration settings or overrides. When you
 7012        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 7013        the key is the configuration setting you want to override or
 7014        :type config_dict: dict
 7015        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 7016        specify the path to a configuration file that contains additional settings. If provided, the
 7017        function will read the contents of this file and update the configuration dictionary with the
 7018        values found in the file, overriding any existing values with the
 7019        :type config_file: str
 7020        :return: The function `get_config_json` returns a dictionary containing the configuration
 7021        settings.
 7022        """
 7023
 7024        # Create with default prioritizations
 7025        config_default = self.get_config_default(name=name)
 7026        configuration = config_default
 7027        # log.debug(f"configuration={configuration}")
 7028
 7029        # Replace prioritizations from dict
 7030        for config in config_dict:
 7031            configuration[config] = config_dict[config]
 7032
 7033        # Replace prioritizations from file
 7034        config_file = full_path(config_file)
 7035        if config_file:
 7036            if os.path.exists(config_file):
 7037                with open(config_file) as config_file_content:
 7038                    config_file_dict = json.load(config_file_content)
 7039                for config in config_file_dict:
 7040                    configuration[config] = config_file_dict[config]
 7041            else:
 7042                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 7043                log.error(msg_error)
 7044                raise ValueError(msg_error)
 7045
 7046        return configuration
 7047
 7048    def prioritization(
 7049        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 7050    ) -> bool:
 7051        """
 7052        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7053        prioritizes variants based on configured profiles and criteria.
 7054
 7055        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7056        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7057        a table name is provided, the method will prioritize the variants in that specific table
 7058        :type table: str
 7059        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7060        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7061        provided, the code will use a default prefix value of "PZ"
 7062        :type pz_prefix: str
 7063        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7064        additional parameters specific to the prioritization process. These parameters can include
 7065        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7066        configurations needed for the prioritization of variants in a V
 7067        :type pz_param: dict
 7068        :return: A boolean value (True) is being returned from the `prioritization` function.
 7069        """
 7070
 7071        # Config
 7072        config = self.get_config()
 7073
 7074        # Param
 7075        param = self.get_param()
 7076
 7077        # Prioritization param
 7078        if pz_param is not None:
 7079            prioritization_param = pz_param
 7080        else:
 7081            prioritization_param = param.get("prioritization", {})
 7082
 7083        # Configuration profiles
 7084        prioritization_config_file = prioritization_param.get(
 7085            "prioritization_config", None
 7086        )
 7087        prioritization_config_file = full_path(prioritization_config_file)
 7088        prioritizations_config = self.get_config_json(
 7089            name="prioritizations", config_file=prioritization_config_file
 7090        )
 7091
 7092        # Prioritization prefix
 7093        pz_prefix_default = "PZ"
 7094        if pz_prefix is None:
 7095            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7096
 7097        # Prioritization options
 7098        profiles = prioritization_param.get("profiles", [])
 7099        if isinstance(profiles, str):
 7100            profiles = profiles.split(",")
 7101        pzfields = prioritization_param.get(
 7102            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7103        )
 7104        if isinstance(pzfields, str):
 7105            pzfields = pzfields.split(",")
 7106        default_profile = prioritization_param.get("default_profile", None)
 7107        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7108        prioritization_score_mode = prioritization_param.get(
 7109            "prioritization_score_mode", "HOWARD"
 7110        )
 7111
 7112        # Quick Prioritizations
 7113        prioritizations = param.get("prioritizations", None)
 7114        if prioritizations:
 7115            log.info("Quick Prioritization:")
 7116            for profile in prioritizations.split(","):
 7117                if profile not in profiles:
 7118                    profiles.append(profile)
 7119                    log.info(f"   {profile}")
 7120
 7121        # If profile "ALL" provided, all profiles in the config profiles
 7122        if "ALL" in profiles:
 7123            profiles = list(prioritizations_config.keys())
 7124
 7125        for profile in profiles:
 7126            if prioritizations_config.get(profile, None):
 7127                log.debug(f"Profile '{profile}' configured")
 7128            else:
 7129                msg_error = f"Profile '{profile}' NOT configured"
 7130                log.error(msg_error)
 7131                raise ValueError(msg_error)
 7132
 7133        if profiles:
 7134            log.info(f"Prioritization... ")
 7135        else:
 7136            log.debug(f"No profile defined")
 7137            return False
 7138
 7139        if not default_profile and len(profiles):
 7140            default_profile = profiles[0]
 7141
 7142        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7143        log.debug("Profiles to check: " + str(list(profiles)))
 7144
 7145        # Variables
 7146        if table is not None:
 7147            table_variants = table
 7148        else:
 7149            table_variants = self.get_table_variants(clause="update")
 7150        log.debug(f"Table to prioritize: {table_variants}")
 7151
 7152        # Added columns
 7153        added_columns = []
 7154
 7155        # Create list of PZfields
 7156        # List of PZFields
 7157        list_of_pzfields_original = pzfields + [
 7158            pzfield + pzfields_sep + profile
 7159            for pzfield in pzfields
 7160            for profile in profiles
 7161        ]
 7162        list_of_pzfields = []
 7163        log.debug(f"{list_of_pzfields_original}")
 7164
 7165        # Remove existing PZfields to use if exists
 7166        for pzfield in list_of_pzfields_original:
 7167            if self.get_header().infos.get(pzfield, None) is None:
 7168                list_of_pzfields.append(pzfield)
 7169                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7170            else:
 7171                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7172
 7173        if list_of_pzfields:
 7174
 7175            # Explode Infos prefix
 7176            explode_infos_prefix = self.get_explode_infos_prefix()
 7177
 7178            # PZfields tags description
 7179            PZfields_INFOS = {
 7180                f"{pz_prefix}Tags": {
 7181                    "ID": f"{pz_prefix}Tags",
 7182                    "Number": ".",
 7183                    "Type": "String",
 7184                    "Description": "Variant tags based on annotation criteria",
 7185                },
 7186                f"{pz_prefix}Score": {
 7187                    "ID": f"{pz_prefix}Score",
 7188                    "Number": 1,
 7189                    "Type": "Integer",
 7190                    "Description": "Variant score based on annotation criteria",
 7191                },
 7192                f"{pz_prefix}Flag": {
 7193                    "ID": f"{pz_prefix}Flag",
 7194                    "Number": 1,
 7195                    "Type": "String",
 7196                    "Description": "Variant flag based on annotation criteria",
 7197                },
 7198                f"{pz_prefix}Comment": {
 7199                    "ID": f"{pz_prefix}Comment",
 7200                    "Number": ".",
 7201                    "Type": "String",
 7202                    "Description": "Variant comment based on annotation criteria",
 7203                },
 7204                f"{pz_prefix}Infos": {
 7205                    "ID": f"{pz_prefix}Infos",
 7206                    "Number": ".",
 7207                    "Type": "String",
 7208                    "Description": "Variant infos based on annotation criteria",
 7209                },
 7210                f"{pz_prefix}Class": {
 7211                    "ID": f"{pz_prefix}Class",
 7212                    "Number": ".",
 7213                    "Type": "String",
 7214                    "Description": "Variant class based on annotation criteria",
 7215                },
 7216            }
 7217
 7218            # Create INFO fields if not exist
 7219            for field in PZfields_INFOS:
 7220                field_ID = PZfields_INFOS[field]["ID"]
 7221                field_description = PZfields_INFOS[field]["Description"]
 7222                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7223                    field_description = (
 7224                        PZfields_INFOS[field]["Description"]
 7225                        + f", profile {default_profile}"
 7226                    )
 7227                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7228                        field_ID,
 7229                        PZfields_INFOS[field]["Number"],
 7230                        PZfields_INFOS[field]["Type"],
 7231                        field_description,
 7232                        "unknown",
 7233                        "unknown",
 7234                        code_type_map[PZfields_INFOS[field]["Type"]],
 7235                    )
 7236
 7237            # Create INFO fields if not exist for each profile
 7238            for profile in prioritizations_config:
 7239                if profile in profiles or profiles == []:
 7240                    for field in PZfields_INFOS:
 7241                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7242                        field_description = (
 7243                            PZfields_INFOS[field]["Description"]
 7244                            + f", profile {profile}"
 7245                        )
 7246                        if (
 7247                            field_ID not in self.get_header().infos
 7248                            and field in pzfields
 7249                        ):
 7250                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7251                                field_ID,
 7252                                PZfields_INFOS[field]["Number"],
 7253                                PZfields_INFOS[field]["Type"],
 7254                                field_description,
 7255                                "unknown",
 7256                                "unknown",
 7257                                code_type_map[PZfields_INFOS[field]["Type"]],
 7258                            )
 7259
 7260            # Header
 7261            for pzfield in list_of_pzfields:
 7262                if re.match(f"{pz_prefix}Score.*", pzfield):
 7263                    added_column = self.add_column(
 7264                        table_name=table_variants,
 7265                        column_name=pzfield,
 7266                        column_type="INTEGER",
 7267                        default_value="0",
 7268                    )
 7269                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7270                    added_column = self.add_column(
 7271                        table_name=table_variants,
 7272                        column_name=pzfield,
 7273                        column_type="BOOLEAN",
 7274                        default_value="1",
 7275                    )
 7276                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7277                    added_column = self.add_column(
 7278                        table_name=table_variants,
 7279                        column_name=pzfield,
 7280                        column_type="VARCHAR[]",
 7281                        default_value="null",
 7282                    )
 7283                else:
 7284                    added_column = self.add_column(
 7285                        table_name=table_variants,
 7286                        column_name=pzfield,
 7287                        column_type="STRING",
 7288                        default_value="''",
 7289                    )
 7290                added_columns.append(added_column)
 7291
 7292            # Profiles
 7293            if profiles:
 7294
 7295                # foreach profile in configuration file
 7296                for profile in prioritizations_config:
 7297
 7298                    # If profile is asked in param, or ALL are asked (empty profile [])
 7299                    if profile in profiles or profiles == []:
 7300                        log.info(f"Profile '{profile}'")
 7301
 7302                        sql_set_info_option = ""
 7303
 7304                        sql_set_info = []
 7305
 7306                        # PZ fields set
 7307
 7308                        # PZScore
 7309                        if (
 7310                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7311                            in list_of_pzfields
 7312                        ):
 7313                            sql_set_info.append(
 7314                                f"""
 7315                                    concat(
 7316                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7317                                        {pz_prefix}Score{pzfields_sep}{profile}
 7318                                    ) 
 7319                                """
 7320                            )
 7321                            if (
 7322                                profile == default_profile
 7323                                and f"{pz_prefix}Score" in list_of_pzfields
 7324                            ):
 7325                                sql_set_info.append(
 7326                                    f"""
 7327                                        concat(
 7328                                            '{pz_prefix}Score=',
 7329                                            {pz_prefix}Score{pzfields_sep}{profile}
 7330                                        )
 7331                                    """
 7332                                )
 7333
 7334                        # PZFlag
 7335                        if (
 7336                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7337                            in list_of_pzfields
 7338                        ):
 7339                            sql_set_info.append(
 7340                                f"""
 7341                                    concat(
 7342                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7343                                        CASE 
 7344                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7345                                            THEN 'PASS'
 7346                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7347                                            THEN 'FILTERED'
 7348                                        END
 7349                                    ) 
 7350                                """
 7351                            )
 7352                            if (
 7353                                profile == default_profile
 7354                                and f"{pz_prefix}Flag" in list_of_pzfields
 7355                            ):
 7356                                sql_set_info.append(
 7357                                    f"""
 7358                                        concat(
 7359                                            '{pz_prefix}Flag=',
 7360                                            CASE 
 7361                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7362                                                THEN 'PASS'
 7363                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7364                                                THEN 'FILTERED'
 7365                                            END
 7366                                        )
 7367                                    """
 7368                                )
 7369
 7370                        # PZClass
 7371                        if (
 7372                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7373                            in list_of_pzfields
 7374                        ):
 7375                            sql_set_info.append(
 7376                                f"""
 7377                                    concat(
 7378                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7379                                        CASE
 7380                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7381                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7382                                            ELSE '.'
 7383                                        END 
 7384                                    )
 7385                                    
 7386                                """
 7387                            )
 7388                            if (
 7389                                profile == default_profile
 7390                                and f"{pz_prefix}Class" in list_of_pzfields
 7391                            ):
 7392                                sql_set_info.append(
 7393                                    f"""
 7394                                        concat(
 7395                                            '{pz_prefix}Class=',
 7396                                            CASE
 7397                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7398                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7399                                                ELSE '.'
 7400                                            END 
 7401                                        )
 7402                                    """
 7403                                )
 7404
 7405                        # PZComment
 7406                        if (
 7407                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7408                            in list_of_pzfields
 7409                        ):
 7410                            sql_set_info.append(
 7411                                f"""
 7412                                    CASE
 7413                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7414                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7415                                        ELSE ''
 7416                                    END
 7417                                """
 7418                            )
 7419                            if (
 7420                                profile == default_profile
 7421                                and f"{pz_prefix}Comment" in list_of_pzfields
 7422                            ):
 7423                                sql_set_info.append(
 7424                                    f"""
 7425                                        CASE
 7426                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7427                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7428                                            ELSE ''
 7429                                        END
 7430                                    """
 7431                                )
 7432
 7433                        # PZInfos
 7434                        if (
 7435                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7436                            in list_of_pzfields
 7437                        ):
 7438                            sql_set_info.append(
 7439                                f"""
 7440                                    CASE
 7441                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7442                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7443                                        ELSE ''
 7444                                    END
 7445                                """
 7446                            )
 7447                            if (
 7448                                profile == default_profile
 7449                                and f"{pz_prefix}Infos" in list_of_pzfields
 7450                            ):
 7451                                sql_set_info.append(
 7452                                    f"""
 7453                                        CASE
 7454                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7455                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7456                                            ELSE ''
 7457                                        END
 7458                                    """
 7459                                )
 7460
 7461                        # Merge PZfields
 7462                        sql_set_info_option = ""
 7463                        sql_set_sep = ""
 7464                        for sql_set in sql_set_info:
 7465                            if sql_set_sep:
 7466                                sql_set_info_option += f"""
 7467                                    , concat('{sql_set_sep}', {sql_set})
 7468                                """
 7469                            else:
 7470                                sql_set_info_option += f"""
 7471                                    , {sql_set}
 7472                                """
 7473                            sql_set_sep = ";"
 7474
 7475                        sql_queries = []
 7476                        for annotation in prioritizations_config[profile]:
 7477
 7478                            # skip special sections
 7479                            if annotation.startswith("_"):
 7480                                continue
 7481
 7482                            # For each criterions
 7483                            for criterion in prioritizations_config[profile][
 7484                                annotation
 7485                            ]:
 7486
 7487                                # Criterion mode
 7488                                criterion_mode = None
 7489                                if np.any(
 7490                                    np.isin(list(criterion.keys()), ["type", "value"])
 7491                                ):
 7492                                    criterion_mode = "operation"
 7493                                elif np.any(
 7494                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7495                                ):
 7496                                    criterion_mode = "sql"
 7497                                log.debug(f"Criterion Mode: {criterion_mode}")
 7498
 7499                                # Criterion parameters
 7500                                criterion_type = criterion.get("type", None)
 7501                                criterion_value = criterion.get("value", None)
 7502                                criterion_sql = criterion.get("sql", None)
 7503                                criterion_fields = criterion.get("fields", None)
 7504                                criterion_score = criterion.get("score", 0)
 7505                                criterion_flag = criterion.get("flag", "PASS")
 7506                                criterion_class = criterion.get("class", None)
 7507                                criterion_flag_bool = criterion_flag == "PASS"
 7508                                criterion_comment = (
 7509                                    ", ".join(criterion.get("comment", []))
 7510                                    .replace("'", "''")
 7511                                    .replace(";", ",")
 7512                                    .replace("\t", " ")
 7513                                )
 7514                                criterion_infos = (
 7515                                    str(criterion)
 7516                                    .replace("'", "''")
 7517                                    .replace(";", ",")
 7518                                    .replace("\t", " ")
 7519                                )
 7520
 7521                                # SQL
 7522                                if criterion_sql is not None and isinstance(
 7523                                    criterion_sql, list
 7524                                ):
 7525                                    criterion_sql = " ".join(criterion_sql)
 7526
 7527                                # Fields and explode
 7528                                if criterion_fields is None:
 7529                                    criterion_fields = [annotation]
 7530                                if not isinstance(criterion_fields, list):
 7531                                    criterion_fields = str(criterion_fields).split(",")
 7532
 7533                                # Class
 7534                                if criterion_class is not None and not isinstance(
 7535                                    criterion_class, list
 7536                                ):
 7537                                    criterion_class = str(criterion_class).split(",")
 7538
 7539                                for annotation_field in criterion_fields:
 7540
 7541                                    # Explode specific annotation
 7542                                    log.debug(
 7543                                        f"Explode annotation '{annotation_field}'"
 7544                                    )
 7545                                    added_columns += self.explode_infos(
 7546                                        prefix=explode_infos_prefix,
 7547                                        fields=[annotation_field],
 7548                                        table=table_variants,
 7549                                    )
 7550                                    extra_infos = self.get_extra_infos(
 7551                                        table=table_variants
 7552                                    )
 7553
 7554                                    # Check if annotation field is present
 7555                                    if (
 7556                                        f"{explode_infos_prefix}{annotation_field}"
 7557                                        not in extra_infos
 7558                                    ):
 7559                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7560                                        log.error(msq_err)
 7561                                        raise ValueError(msq_err)
 7562                                    else:
 7563                                        log.debug(
 7564                                            f"Annotation '{annotation_field}' in data"
 7565                                        )
 7566
 7567                                sql_set = []
 7568                                sql_set_info = []
 7569
 7570                                # PZ fields set
 7571
 7572                                # PZScore
 7573                                if (
 7574                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7575                                    in list_of_pzfields
 7576                                ):
 7577                                    # if prioritization_score_mode == "HOWARD":
 7578                                    #     sql_set.append(
 7579                                    #         f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7580                                    #     )
 7581                                    # VaRank prioritization score mode
 7582                                    if prioritization_score_mode == "VaRank":
 7583                                        sql_set.append(
 7584                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
 7585                                        )
 7586                                    # default HOWARD prioritization score mode
 7587                                    else:
 7588                                        sql_set.append(
 7589                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7590                                        )
 7591
 7592                                # PZFlag
 7593                                if (
 7594                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7595                                    in list_of_pzfields
 7596                                ):
 7597                                    sql_set.append(
 7598                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7599                                    )
 7600
 7601                                # PZClass
 7602                                if (
 7603                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7604                                    in list_of_pzfields
 7605                                    and criterion_class is not None
 7606                                ):
 7607                                    sql_set.append(
 7608                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7609                                    )
 7610
 7611                                # PZComment
 7612                                if (
 7613                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7614                                    in list_of_pzfields
 7615                                ):
 7616                                    sql_set.append(
 7617                                        f"""
 7618                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7619                                                concat(
 7620                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7621                                                    CASE 
 7622                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7623                                                        THEN ', '
 7624                                                        ELSE ''
 7625                                                    END,
 7626                                                    '{criterion_comment}'
 7627                                                )
 7628                                        """
 7629                                    )
 7630
 7631                                # PZInfos
 7632                                if (
 7633                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7634                                    in list_of_pzfields
 7635                                ):
 7636                                    sql_set.append(
 7637                                        f"""
 7638                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7639                                                concat(
 7640                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7641                                                    '{criterion_infos}'
 7642                                                )
 7643                                        """
 7644                                    )
 7645                                sql_set_option = ",".join(sql_set)
 7646
 7647                                # Criterion and comparison
 7648                                if sql_set_option:
 7649
 7650                                    if criterion_mode in ["operation"]:
 7651
 7652                                        try:
 7653                                            float(criterion_value)
 7654                                            sql_update = f"""
 7655                                                UPDATE {table_variants}
 7656                                                SET {sql_set_option}
 7657                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7658                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7659                                            """
 7660                                        except:
 7661                                            contains_option = ""
 7662                                            if criterion_type == "contains":
 7663                                                contains_option = ".*"
 7664                                            sql_update = f"""
 7665                                                UPDATE {table_variants}
 7666                                                SET {sql_set_option}
 7667                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7668                                            """
 7669                                        sql_queries.append(sql_update)
 7670
 7671                                    elif criterion_mode in ["sql"]:
 7672
 7673                                        sql_update = f"""
 7674                                            UPDATE {table_variants}
 7675                                            SET {sql_set_option}
 7676                                            WHERE {criterion_sql}
 7677                                        """
 7678                                        sql_queries.append(sql_update)
 7679
 7680                                    else:
 7681                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7682                                        log.error(msg_err)
 7683                                        raise ValueError(msg_err)
 7684
 7685                                else:
 7686                                    log.warning(
 7687                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7688                                    )
 7689
 7690                        # PZTags
 7691                        if (
 7692                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7693                            in list_of_pzfields
 7694                        ):
 7695
 7696                            # Create PZFalgs value
 7697                            pztags_value = ""
 7698                            pztags_sep_default = ","
 7699                            pztags_sep = ""
 7700                            for pzfield in pzfields:
 7701                                if pzfield not in [f"{pz_prefix}Tags"]:
 7702                                    if (
 7703                                        f"{pzfield}{pzfields_sep}{profile}"
 7704                                        in list_of_pzfields
 7705                                    ):
 7706                                        if pzfield in [f"{pz_prefix}Flag"]:
 7707                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7708                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7709                                                    THEN 'PASS'
 7710                                                    ELSE 'FILTERED'
 7711                                                END, '"""
 7712                                        elif pzfield in [f"{pz_prefix}Class"]:
 7713                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7714                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7715                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7716                                                    ELSE '.'
 7717                                                END, '"""
 7718                                        else:
 7719                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7720                                        pztags_sep = pztags_sep_default
 7721
 7722                            # Add Query update for PZFlags
 7723                            sql_update_pztags = f"""
 7724                                UPDATE {table_variants}
 7725                                SET INFO = concat(
 7726                                        INFO,
 7727                                        CASE WHEN INFO NOT in ('','.')
 7728                                                THEN ';'
 7729                                                ELSE ''
 7730                                        END,
 7731                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7732                                    )
 7733                                """
 7734                            sql_queries.append(sql_update_pztags)
 7735
 7736                            # Add Query update for PZFlags for default
 7737                            if profile == default_profile:
 7738                                sql_update_pztags_default = f"""
 7739                                UPDATE {table_variants}
 7740                                SET INFO = concat(
 7741                                        INFO,
 7742                                        ';',
 7743                                        '{pz_prefix}Tags={pztags_value}'
 7744                                    )
 7745                                """
 7746                                sql_queries.append(sql_update_pztags_default)
 7747
 7748                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7749
 7750                        if sql_queries:
 7751
 7752                            for sql_query in sql_queries:
 7753                                log.debug(
 7754                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7755                                )
 7756                                self.conn.execute(sql_query)
 7757
 7758                        log.info(f"""Profile '{profile}' - Update... """)
 7759                        sql_query_update = f"""
 7760                            UPDATE {table_variants}
 7761                            SET INFO =  
 7762                                concat(
 7763                                    CASE
 7764                                        WHEN INFO NOT IN ('','.')
 7765                                        THEN concat(INFO, ';')
 7766                                        ELSE ''
 7767                                    END
 7768                                    {sql_set_info_option}
 7769                                )
 7770                        """
 7771                        self.conn.execute(sql_query_update)
 7772
 7773        else:
 7774
 7775            log.warning(f"No profiles in parameters")
 7776
 7777        # Remove added columns
 7778        for added_column in added_columns:
 7779            self.drop_column(column=added_column)
 7780
 7781        # Explode INFOS fields into table fields
 7782        if self.get_explode_infos():
 7783            self.explode_infos(
 7784                prefix=self.get_explode_infos_prefix(),
 7785                fields=self.get_explode_infos_fields(),
 7786                force=True,
 7787            )
 7788
 7789        return True
 7790
 7791    ###
 7792    # HGVS
 7793    ###
 7794
 7795    def annotation_hgvs(self, threads: int = None) -> None:
 7796        """
 7797        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7798        coordinates and alleles.
 7799
 7800        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7801        threads to use for parallel processing. If no value is provided, it will default to the number
 7802        of threads obtained from the `get_threads()` method
 7803        :type threads: int
 7804        """
 7805
 7806        # Function for each partition of the Dask Dataframe
 7807        def partition_function(partition):
 7808            """
 7809            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7810            each row of a DataFrame called `partition`.
 7811
 7812            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7813            to be processed
 7814            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7815            the "partition" dataframe along the axis 1.
 7816            """
 7817            return partition.apply(annotation_hgvs_partition, axis=1)
 7818
 7819        def annotation_hgvs_partition(row) -> str:
 7820            """
 7821            The function `annotation_hgvs_partition` takes in a row of data and returns a string
 7822            containing a list of HGVS names associated with the given genomic coordinates and alleles.
 7823
 7824            :param row: A dictionary-like object that contains the values for the following keys:
 7825            :return: a string that contains the HGVS names associated with the given row of data.
 7826            """
 7827
 7828            chr = row["CHROM"]
 7829            pos = row["POS"]
 7830            ref = row["REF"]
 7831            alt = row["ALT"]
 7832
 7833            # Find list of associated transcripts
 7834            transcripts_list = list(
 7835                polars_conn.execute(
 7836                    f"""
 7837                SELECT transcript
 7838                FROM refseq_df
 7839                WHERE CHROM='{chr}'
 7840                AND POS={pos}
 7841            """
 7842                )["transcript"]
 7843            )
 7844
 7845            # Full HGVS annotation in list
 7846            hgvs_full_list = []
 7847
 7848            for transcript_name in transcripts_list:
 7849
 7850                # Transcript
 7851                transcript = get_transcript(
 7852                    transcripts=transcripts, transcript_name=transcript_name
 7853                )
 7854                # Exon
 7855                if use_exon:
 7856                    exon = transcript.find_exon_number(pos)
 7857                else:
 7858                    exon = None
 7859                # Protein
 7860                transcript_protein = None
 7861                if use_protein or add_protein or full_format:
 7862                    transcripts_protein = list(
 7863                        polars_conn.execute(
 7864                            f"""
 7865                        SELECT protein
 7866                        FROM refseqlink_df
 7867                        WHERE transcript='{transcript_name}'
 7868                        LIMIT 1
 7869                    """
 7870                        )["protein"]
 7871                    )
 7872                    if len(transcripts_protein):
 7873                        transcript_protein = transcripts_protein[0]
 7874
 7875                # HGVS name
 7876                hgvs_name = format_hgvs_name(
 7877                    chr,
 7878                    pos,
 7879                    ref,
 7880                    alt,
 7881                    genome=genome,
 7882                    transcript=transcript,
 7883                    transcript_protein=transcript_protein,
 7884                    exon=exon,
 7885                    use_gene=use_gene,
 7886                    use_protein=use_protein,
 7887                    full_format=full_format,
 7888                    use_version=use_version,
 7889                    codon_type=codon_type,
 7890                )
 7891                hgvs_full_list.append(hgvs_name)
 7892                if add_protein and not use_protein and not full_format:
 7893                    hgvs_name = format_hgvs_name(
 7894                        chr,
 7895                        pos,
 7896                        ref,
 7897                        alt,
 7898                        genome=genome,
 7899                        transcript=transcript,
 7900                        transcript_protein=transcript_protein,
 7901                        exon=exon,
 7902                        use_gene=use_gene,
 7903                        use_protein=True,
 7904                        full_format=False,
 7905                        use_version=use_version,
 7906                        codon_type=codon_type,
 7907                    )
 7908                    hgvs_full_list.append(hgvs_name)
 7909
 7910            # Create liste of HGVS annotations
 7911            hgvs_full = ",".join(hgvs_full_list)
 7912
 7913            return hgvs_full
 7914
 7915        # Polars connexion
 7916        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7917
 7918        # Config
 7919        config = self.get_config()
 7920
 7921        # Databases
 7922        # Genome
 7923        databases_genomes_folders = (
 7924            config.get("folders", {})
 7925            .get("databases", {})
 7926            .get("genomes", DEFAULT_GENOME_FOLDER)
 7927        )
 7928        databases_genome = (
 7929            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7930        )
 7931        # refseq database folder
 7932        databases_refseq_folders = (
 7933            config.get("folders", {})
 7934            .get("databases", {})
 7935            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7936        )
 7937        # refseq
 7938        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7939        # refSeqLink
 7940        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7941
 7942        # Param
 7943        param = self.get_param()
 7944
 7945        # Quick HGVS
 7946        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7947            log.info(f"Quick HGVS Annotation:")
 7948            if not param.get("hgvs", None):
 7949                param["hgvs"] = {}
 7950            for option in param.get("hgvs_options", "").split(","):
 7951                option_var_val = option.split("=")
 7952                option_var = option_var_val[0]
 7953                if len(option_var_val) > 1:
 7954                    option_val = option_var_val[1]
 7955                else:
 7956                    option_val = "True"
 7957                if option_val.upper() in ["TRUE"]:
 7958                    option_val = True
 7959                elif option_val.upper() in ["FALSE"]:
 7960                    option_val = False
 7961                log.info(f"   {option_var}={option_val}")
 7962                param["hgvs"][option_var] = option_val
 7963
 7964        # Check if HGVS annotation enabled
 7965        if "hgvs" in param:
 7966            log.info(f"HGVS Annotation... ")
 7967            for hgvs_option in param.get("hgvs", {}):
 7968                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7969        else:
 7970            return
 7971
 7972        # HGVS Param
 7973        param_hgvs = param.get("hgvs", {})
 7974        use_exon = param_hgvs.get("use_exon", False)
 7975        use_gene = param_hgvs.get("use_gene", False)
 7976        use_protein = param_hgvs.get("use_protein", False)
 7977        add_protein = param_hgvs.get("add_protein", False)
 7978        full_format = param_hgvs.get("full_format", False)
 7979        use_version = param_hgvs.get("use_version", False)
 7980        codon_type = param_hgvs.get("codon_type", "3")
 7981
 7982        # refSseq refSeqLink
 7983        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 7984        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 7985
 7986        # Assembly
 7987        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 7988
 7989        # Genome
 7990        genome_file = None
 7991        if find_genome(databases_genome):
 7992            genome_file = find_genome(databases_genome)
 7993        else:
 7994            genome_file = find_genome(
 7995                genome_path=databases_genomes_folders, assembly=assembly
 7996            )
 7997        log.debug("Genome: " + str(genome_file))
 7998
 7999        # refSseq
 8000        refseq_file = find_file_prefix(
 8001            input_file=databases_refseq,
 8002            prefix="ncbiRefSeq",
 8003            folder=databases_refseq_folders,
 8004            assembly=assembly,
 8005        )
 8006        log.debug("refSeq: " + str(refseq_file))
 8007
 8008        # refSeqLink
 8009        refseqlink_file = find_file_prefix(
 8010            input_file=databases_refseqlink,
 8011            prefix="ncbiRefSeqLink",
 8012            folder=databases_refseq_folders,
 8013            assembly=assembly,
 8014        )
 8015        log.debug("refSeqLink: " + str(refseqlink_file))
 8016
 8017        # Threads
 8018        if not threads:
 8019            threads = self.get_threads()
 8020        log.debug("Threads: " + str(threads))
 8021
 8022        # Variables
 8023        table_variants = self.get_table_variants(clause="update")
 8024
 8025        # Get variants SNV and InDel only
 8026        query_variants = f"""
 8027            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 8028            FROM {table_variants}
 8029            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 8030            """
 8031        df_variants = self.get_query_to_df(query_variants)
 8032
 8033        # Added columns
 8034        added_columns = []
 8035
 8036        # Add hgvs column in variants table
 8037        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 8038        added_column = self.add_column(
 8039            table_variants, hgvs_column_name, "STRING", default_value=None
 8040        )
 8041        added_columns.append(added_column)
 8042
 8043        log.debug(f"refSeq loading...")
 8044        # refSeq in duckDB
 8045        refseq_table = get_refseq_table(
 8046            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 8047        )
 8048        # Loading all refSeq in Dataframe
 8049        refseq_query = f"""
 8050            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 8051            FROM {refseq_table}
 8052            JOIN df_variants ON (
 8053                {refseq_table}.chrom = df_variants.CHROM
 8054                AND {refseq_table}.txStart<=df_variants.POS
 8055                AND {refseq_table}.txEnd>=df_variants.POS
 8056            )
 8057        """
 8058        refseq_df = self.conn.query(refseq_query).pl()
 8059
 8060        if refseqlink_file:
 8061            log.debug(f"refSeqLink loading...")
 8062            # refSeqLink in duckDB
 8063            refseqlink_table = get_refseq_table(
 8064                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 8065            )
 8066            # Loading all refSeqLink in Dataframe
 8067            protacc_column = "protAcc_with_ver"
 8068            mrnaacc_column = "mrnaAcc_with_ver"
 8069            refseqlink_query = f"""
 8070                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 8071                FROM {refseqlink_table} 
 8072                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 8073                WHERE protAcc_without_ver IS NOT NULL
 8074            """
 8075            # Polars Dataframe
 8076            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 8077
 8078        # Read RefSeq transcripts into a python dict/model.
 8079        log.debug(f"Transcripts loading...")
 8080        with tempfile.TemporaryDirectory() as tmpdir:
 8081            transcripts_query = f"""
 8082                COPY (
 8083                    SELECT {refseq_table}.*
 8084                    FROM {refseq_table}
 8085                    JOIN df_variants ON (
 8086                        {refseq_table}.chrom=df_variants.CHROM
 8087                        AND {refseq_table}.txStart<=df_variants.POS
 8088                        AND {refseq_table}.txEnd>=df_variants.POS
 8089                    )
 8090                )
 8091                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 8092            """
 8093            self.conn.query(transcripts_query)
 8094            with open(f"{tmpdir}/transcript.tsv") as infile:
 8095                transcripts = read_transcripts(infile)
 8096
 8097        # Polars connexion
 8098        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 8099
 8100        log.debug("Genome loading...")
 8101        # Read genome sequence using pyfaidx.
 8102        genome = Fasta(genome_file)
 8103
 8104        log.debug("Start annotation HGVS...")
 8105
 8106        # Create
 8107        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 8108        ddf = dd.from_pandas(df_variants, npartitions=threads)
 8109
 8110        # Use dask.dataframe.apply() to apply function on each partition
 8111        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 8112
 8113        # Convert Dask DataFrame to Pandas Dataframe
 8114        df = ddf.compute()
 8115
 8116        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 8117        with tempfile.TemporaryDirectory() as tmpdir:
 8118            df_parquet = os.path.join(tmpdir, "df.parquet")
 8119            df.to_parquet(df_parquet)
 8120
 8121            # Update hgvs column
 8122            update_variant_query = f"""
 8123                UPDATE {table_variants}
 8124                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 8125                FROM read_parquet('{df_parquet}') as df
 8126                WHERE variants."#CHROM" = df.CHROM
 8127                AND variants.POS = df.POS
 8128                AND variants.REF = df.REF
 8129                AND variants.ALT = df.ALT
 8130                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 8131                """
 8132            self.execute_query(update_variant_query)
 8133
 8134        # Update INFO column
 8135        sql_query_update = f"""
 8136            UPDATE {table_variants}
 8137            SET INFO = 
 8138                concat(
 8139                    CASE 
 8140                        WHEN INFO NOT IN ('','.')
 8141                        THEN concat(INFO, ';')
 8142                        ELSE ''
 8143                    END,
 8144                    'hgvs=',
 8145                    {hgvs_column_name}
 8146                )
 8147            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 8148            """
 8149        self.execute_query(sql_query_update)
 8150
 8151        # Add header
 8152        HGVS_INFOS = {
 8153            "hgvs": {
 8154                "ID": "hgvs",
 8155                "Number": ".",
 8156                "Type": "String",
 8157                "Description": f"HGVS annotatation with HOWARD",
 8158            }
 8159        }
 8160
 8161        for field in HGVS_INFOS:
 8162            field_ID = HGVS_INFOS[field]["ID"]
 8163            field_description = HGVS_INFOS[field]["Description"]
 8164            self.get_header().infos[field_ID] = vcf.parser._Info(
 8165                field_ID,
 8166                HGVS_INFOS[field]["Number"],
 8167                HGVS_INFOS[field]["Type"],
 8168                field_description,
 8169                "unknown",
 8170                "unknown",
 8171                code_type_map[HGVS_INFOS[field]["Type"]],
 8172            )
 8173
 8174        # Remove added columns
 8175        for added_column in added_columns:
 8176            self.drop_column(column=added_column)
 8177
 8178    ###
 8179    # Calculation
 8180    ###
 8181
 8182    def get_operations_help(
 8183        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8184    ) -> list:
 8185
 8186        # Init
 8187        operations_help = []
 8188
 8189        # operations
 8190        operations = self.get_config_json(
 8191            name="calculations",
 8192            config_dict=operations_config_dict,
 8193            config_file=operations_config_file,
 8194        )
 8195        for op in operations:
 8196            op_name = operations[op].get("name", op).upper()
 8197            op_description = operations[op].get("description", op_name)
 8198            op_available = operations[op].get("available", False)
 8199            if op_available:
 8200                operations_help.append(f"   {op_name}: {op_description}")
 8201
 8202        # Sort operations
 8203        operations_help.sort()
 8204
 8205        # insert header
 8206        operations_help.insert(0, "Available calculation operations:")
 8207
 8208        # Return
 8209        return operations_help
 8210
 8211    def calculation(
 8212        self,
 8213        operations: dict = {},
 8214        operations_config_dict: dict = {},
 8215        operations_config_file: str = None,
 8216    ) -> None:
 8217        """
 8218        It takes a list of operations, and for each operation, it checks if it's a python or sql
 8219        operation, and then calls the appropriate function
 8220
 8221        param json example:
 8222            "calculation": {
 8223                "NOMEN": {
 8224                    "options": {
 8225                        "hgvs_field": "hgvs"
 8226                    },
 8227                "middle" : null
 8228            }
 8229        """
 8230
 8231        # Param
 8232        param = self.get_param()
 8233
 8234        # operations config
 8235        operations_config = self.get_config_json(
 8236            name="calculations",
 8237            config_dict=operations_config_dict,
 8238            config_file=operations_config_file,
 8239        )
 8240
 8241        # Upper keys
 8242        operations_config = {k.upper(): v for k, v in operations_config.items()}
 8243
 8244        # Calculations
 8245
 8246        # Operations from param
 8247        operations = param.get("calculation", {}).get("calculations", operations)
 8248
 8249        # Quick calculation - add
 8250        if param.get("calculations", None):
 8251
 8252            # List of operations
 8253            calculations_list = [
 8254                value.strip() for value in param.get("calculations", "").split(",")
 8255            ]
 8256
 8257            # Log
 8258            log.info(f"Quick Calculations:")
 8259            for calculation_key in calculations_list:
 8260                log.info(f"   {calculation_key}")
 8261
 8262            # Create tmp operations (to keep operation order)
 8263            operations_tmp = {}
 8264            for calculation_operation in calculations_list:
 8265                if calculation_operation.upper() not in operations_tmp:
 8266                    log.debug(
 8267                        f"{calculation_operation}.upper() not in {operations_tmp}"
 8268                    )
 8269                    operations_tmp[calculation_operation.upper()] = {}
 8270                    add_value_into_dict(
 8271                        dict_tree=operations_tmp,
 8272                        sections=[
 8273                            calculation_operation.upper(),
 8274                        ],
 8275                        value=operations.get(calculation_operation.upper(), {}),
 8276                    )
 8277            # Add operations already in param
 8278            for calculation_operation in operations:
 8279                if calculation_operation not in operations_tmp:
 8280                    operations_tmp[calculation_operation] = operations.get(
 8281                        calculation_operation, {}
 8282                    )
 8283
 8284            # Update operations in param
 8285            operations = operations_tmp
 8286
 8287        # Operations for calculation
 8288        if not operations:
 8289            operations = param.get("calculation", {}).get("calculations", {})
 8290
 8291        if operations:
 8292            log.info(f"Calculations...")
 8293
 8294        # For each operations
 8295        for operation_name in operations:
 8296            operation_name = operation_name.upper()
 8297            if operation_name not in [""]:
 8298                if operation_name in operations_config:
 8299                    log.info(f"Calculation '{operation_name}'")
 8300                    operation = operations_config[operation_name]
 8301                    operation_type = operation.get("type", "sql")
 8302                    if operation_type == "python":
 8303                        self.calculation_process_function(
 8304                            operation=operation, operation_name=operation_name
 8305                        )
 8306                    elif operation_type == "sql":
 8307                        self.calculation_process_sql(
 8308                            operation=operation, operation_name=operation_name
 8309                        )
 8310                    else:
 8311                        log.error(
 8312                            f"Operations config: Type '{operation_type}' NOT available"
 8313                        )
 8314                        raise ValueError(
 8315                            f"Operations config: Type '{operation_type}' NOT available"
 8316                        )
 8317                else:
 8318                    log.error(
 8319                        f"Operations config: Calculation '{operation_name}' NOT available"
 8320                    )
 8321                    raise ValueError(
 8322                        f"Operations config: Calculation '{operation_name}' NOT available"
 8323                    )
 8324
 8325        # Explode INFOS fields into table fields
 8326        if self.get_explode_infos():
 8327            self.explode_infos(
 8328                prefix=self.get_explode_infos_prefix(),
 8329                fields=self.get_explode_infos_fields(),
 8330                force=True,
 8331            )
 8332
 8333    def calculation_process_sql(
 8334        self, operation: dict, operation_name: str = "unknown"
 8335    ) -> None:
 8336        """
 8337        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8338        performs the operation, updating the specified table with the result.
 8339
 8340        :param operation: The `operation` parameter is a dictionary that contains information about the
 8341        mathematical operation to be performed. It includes the following keys:
 8342        :type operation: dict
 8343        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8344        the mathematical operation being performed. It is used for logging and error handling purposes,
 8345        defaults to unknown
 8346        :type operation_name: str (optional)
 8347        """
 8348
 8349        # Operation infos
 8350        operation_name = operation.get("name", "unknown")
 8351        log.debug(f"process sql {operation_name}")
 8352        output_column_name = operation.get("output_column_name", operation_name)
 8353        output_column_type = operation.get("output_column_type", "String")
 8354        prefix = operation.get("explode_infos_prefix", "")
 8355        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8356        output_column_description = operation.get(
 8357            "output_column_description", f"{operation_name} operation"
 8358        )
 8359        operation_query = operation.get("operation_query", None)
 8360        if isinstance(operation_query, list):
 8361            operation_query = " ".join(operation_query)
 8362        operation_info_fields = operation.get("info_fields", [])
 8363        operation_info_fields_check = operation.get("info_fields_check", False)
 8364        operation_info = operation.get("operation_info", True)
 8365        operation_table = operation.get(
 8366            "table", self.get_table_variants(clause="alter")
 8367        )
 8368
 8369        # table variants
 8370        if operation_table:
 8371            table_variants = operation_table
 8372        else:
 8373            table_variants = self.get_table_variants(clause="alter")
 8374
 8375        if operation_query:
 8376
 8377            # Info fields check
 8378            operation_info_fields_check_result = True
 8379            if operation_info_fields_check:
 8380                header_infos = self.get_header().infos
 8381                for info_field in operation_info_fields:
 8382                    operation_info_fields_check_result = (
 8383                        operation_info_fields_check_result
 8384                        and info_field in header_infos
 8385                    )
 8386
 8387            # If info fields available
 8388            if operation_info_fields_check_result:
 8389
 8390                # Added_columns
 8391                added_columns = []
 8392
 8393                # Create VCF header field
 8394                vcf_reader = self.get_header()
 8395                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8396                    output_column_name,
 8397                    ".",
 8398                    output_column_type,
 8399                    output_column_description,
 8400                    "howard calculation",
 8401                    "0",
 8402                    self.code_type_map.get(output_column_type),
 8403                )
 8404
 8405                # Explode infos if needed
 8406                log.debug(f"calculation_process_sql prefix {prefix}")
 8407                added_columns += self.explode_infos(
 8408                    prefix=prefix,
 8409                    fields=[output_column_name] + operation_info_fields,
 8410                    force=False,
 8411                    table=table_variants,
 8412                )
 8413
 8414                # Create column
 8415                added_column = self.add_column(
 8416                    table_name=table_variants,
 8417                    column_name=prefix + output_column_name,
 8418                    column_type=output_column_type_sql,
 8419                    default_value="null",
 8420                )
 8421                added_columns.append(added_column)
 8422
 8423                # Operation calculation
 8424                try:
 8425
 8426                    # Query to update calculation column
 8427                    sql_update = f"""
 8428                        UPDATE {table_variants}
 8429                        SET "{prefix}{output_column_name}" = ({operation_query})
 8430                    """
 8431                    self.conn.execute(sql_update)
 8432
 8433                    # Add to INFO
 8434                    if operation_info:
 8435                        sql_update_info = f"""
 8436                            UPDATE {table_variants}
 8437                            SET "INFO" =
 8438                                concat(
 8439                                    CASE
 8440                                        WHEN "INFO" IS NOT NULL
 8441                                        THEN concat("INFO", ';')
 8442                                        ELSE ''
 8443                                    END,
 8444                                    '{output_column_name}=',
 8445                                    "{prefix}{output_column_name}"
 8446                                )
 8447                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8448                        """
 8449                        self.conn.execute(sql_update_info)
 8450
 8451                except:
 8452                    log.error(
 8453                        f"Operations config: Calculation '{operation_name}' query failed"
 8454                    )
 8455                    raise ValueError(
 8456                        f"Operations config: Calculation '{operation_name}' query failed"
 8457                    )
 8458
 8459                # Remove added columns
 8460                for added_column in added_columns:
 8461                    log.debug(f"added_column: {added_column}")
 8462                    self.drop_column(column=added_column)
 8463
 8464            else:
 8465                log.error(
 8466                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8467                )
 8468                raise ValueError(
 8469                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8470                )
 8471
 8472        else:
 8473            log.error(
 8474                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8475            )
 8476            raise ValueError(
 8477                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8478            )
 8479
 8480    def calculation_process_function(
 8481        self, operation: dict, operation_name: str = "unknown"
 8482    ) -> None:
 8483        """
 8484        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8485        function with the given parameters.
 8486
 8487        :param operation: The `operation` parameter is a dictionary that contains information about the
 8488        operation to be performed. It has the following keys:
 8489        :type operation: dict
 8490        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8491        the operation being performed. It is used for logging purposes, defaults to unknown
 8492        :type operation_name: str (optional)
 8493        """
 8494
 8495        operation_name = operation["name"]
 8496        log.debug(f"process sql {operation_name}")
 8497        function_name = operation["function_name"]
 8498        function_params = operation["function_params"]
 8499        getattr(self, function_name)(*function_params)
 8500
 8501    def calculation_variant_id(self) -> None:
 8502        """
 8503        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8504        updates the INFO field of a variants table with the variant ID.
 8505        """
 8506
 8507        # variant_id annotation field
 8508        variant_id_tag = self.get_variant_id_column()
 8509        added_columns = [variant_id_tag]
 8510
 8511        # variant_id hgvs tags"
 8512        vcf_infos_tags = {
 8513            variant_id_tag: "howard variant ID annotation",
 8514        }
 8515
 8516        # Variants table
 8517        table_variants = self.get_table_variants()
 8518
 8519        # Header
 8520        vcf_reader = self.get_header()
 8521
 8522        # Add variant_id to header
 8523        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8524            variant_id_tag,
 8525            ".",
 8526            "String",
 8527            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8528            "howard calculation",
 8529            "0",
 8530            self.code_type_map.get("String"),
 8531        )
 8532
 8533        # Update
 8534        sql_update = f"""
 8535            UPDATE {table_variants}
 8536            SET "INFO" = 
 8537                concat(
 8538                    CASE
 8539                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8540                        THEN ''
 8541                        ELSE concat("INFO", ';')
 8542                    END,
 8543                    '{variant_id_tag}=',
 8544                    "{variant_id_tag}"
 8545                )
 8546        """
 8547        self.conn.execute(sql_update)
 8548
 8549        # Remove added columns
 8550        for added_column in added_columns:
 8551            self.drop_column(column=added_column)
 8552
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        If the snpEff annotation field is absent from the VCF header, a warning is
        logged and nothing is changed. Temporary exploded columns are dropped at
        the end in either case.

        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
        function is used to specify the name of the column that will store the HGVS nomenclatures
        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
        snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
        function represents the field in the VCF file that contains SnpEff annotations. This field is
        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
        to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff annotation header description cannot be parsed
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any non-empty explode-infos prefix is replaced by the
        # literal "INFO/" — confirm this override is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (column names of the exploded INFO fields)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (temporary columns, dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: snpEff describes its sub-fields in the INFO
            # description as a quoted, " | "-separated list
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters to build a safe key,
                    # mapped back to the original sub-field label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (column used to join the dataframe back)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe with variant id and the exploded ANN field
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column by parsing each ANN value
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO from the dataframe, joined on the variant id column
            # (presumably duckdb resolves `dataframe_snpeff_hgvs` from the
            # enclosing Python scope via its replacement scan — verify)
            # NOTE(review): the UPDATE targets the hard-coded table 'variants'
            # while the WHERE clause uses {table_variants} — confirm both always
            # refer to the same table
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8689
 8690    def calculation_snpeff_ann_explode(
 8691        self,
 8692        uniquify: bool = True,
 8693        output_format: str = "fields",
 8694        output_prefix: str = "snpeff_",
 8695        snpeff_field: str = "ANN",
 8696    ) -> None:
 8697        """
 8698        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8699        exploding the HGVS field and updating variant information accordingly.
 8700
 8701        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8702        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8703        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8704        defaults to True
 8705        :type uniquify: bool (optional)
 8706        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8707        function specifies the format in which the output annotations will be generated. It has a
 8708        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8709        format, defaults to fields
 8710        :type output_format: str (optional)
 8711        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8712        method is used to specify the prefix that will be added to the output annotations generated
 8713        during the calculation process. This prefix helps to differentiate the newly added annotations
 8714        from existing ones in the output data. By default, the, defaults to ANN_
 8715        :type output_prefix: str (optional)
 8716        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8717        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8718        field will be processed to explode the HGVS annotations and update the variant information
 8719        accordingly, defaults to ANN
 8720        :type snpeff_field: str (optional)
 8721        """
 8722
 8723        # SnpEff annotation field
 8724        snpeff_hgvs = "snpeff_ann_explode"
 8725
 8726        # Snpeff hgvs tags
 8727        vcf_infos_tags = {
 8728            snpeff_hgvs: "Explode snpEff annotations",
 8729        }
 8730
 8731        # Prefix
 8732        prefix = self.get_explode_infos_prefix()
 8733        if prefix:
 8734            prefix = "INFO/"
 8735
 8736        # snpEff fields
 8737        speff_ann_infos = prefix + snpeff_field
 8738        speff_hgvs_infos = prefix + snpeff_hgvs
 8739
 8740        # Variants table
 8741        table_variants = self.get_table_variants()
 8742
 8743        # Header
 8744        vcf_reader = self.get_header()
 8745
 8746        # Add columns
 8747        added_columns = []
 8748
 8749        # Explode HGVS field in column
 8750        added_columns += self.explode_infos(fields=[snpeff_field])
 8751        log.debug(f"snpeff_field={snpeff_field}")
 8752        log.debug(f"added_columns={added_columns}")
 8753
 8754        if snpeff_field in vcf_reader.infos:
 8755
 8756            # Extract ANN header
 8757            ann_description = vcf_reader.infos[snpeff_field].desc
 8758            pattern = r"'(.+?)'"
 8759            match = re.search(pattern, ann_description)
 8760            if match:
 8761                ann_header_match = match.group(1).split(" | ")
 8762                ann_header = []
 8763                ann_header_desc = {}
 8764                for i in range(len(ann_header_match)):
 8765                    ann_header_info = "".join(
 8766                        char for char in ann_header_match[i] if char.isalnum()
 8767                    )
 8768                    ann_header.append(ann_header_info)
 8769                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8770                if not ann_header_desc:
 8771                    raise ValueError("Invalid header description format")
 8772            else:
 8773                raise ValueError("Invalid header description format")
 8774
 8775            # Create variant id
 8776            variant_id_column = self.get_variant_id_column()
 8777            added_columns += [variant_id_column]
 8778
 8779            # Create dataframe
 8780            dataframe_snpeff_hgvs = self.get_query_to_df(
 8781                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8782            )
 8783
 8784            # Create snpEff columns
 8785            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8786                speff_ann_infos
 8787            ].apply(
 8788                lambda x: explode_snpeff_ann(
 8789                    str(x),
 8790                    uniquify=uniquify,
 8791                    output_format=output_format,
 8792                    prefix=output_prefix,
 8793                    header=list(ann_header_desc.values()),
 8794                )
 8795            )
 8796
 8797            # Header
 8798            ann_annotations_prefix = ""
 8799            if output_format.upper() in ["JSON"]:
 8800                ann_annotations_prefix = f"{output_prefix}="
 8801                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8802                    output_prefix,
 8803                    ".",
 8804                    "String",
 8805                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8806                    + " - JSON format",
 8807                    "howard calculation",
 8808                    "0",
 8809                    self.code_type_map.get("String"),
 8810                )
 8811            else:
 8812                for ann_annotation in ann_header:
 8813                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8814                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8815                        ann_annotation_id,
 8816                        ".",
 8817                        "String",
 8818                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8819                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8820                        "howard calculation",
 8821                        "0",
 8822                        self.code_type_map.get("String"),
 8823                    )
 8824
 8825            # Update
 8826            sql_update = f"""
 8827                UPDATE variants
 8828                SET "INFO" = 
 8829                    concat(
 8830                        CASE
 8831                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8832                            THEN ''
 8833                            ELSE concat("INFO", ';')
 8834                        END,
 8835                        CASE 
 8836                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8837                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8838                            THEN concat(
 8839                                '{ann_annotations_prefix}',
 8840                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8841                                )
 8842                            ELSE ''
 8843                        END
 8844                    )
 8845                FROM dataframe_snpeff_hgvs
 8846                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8847
 8848            """
 8849            self.conn.execute(sql_update)
 8850
 8851            # Delete dataframe
 8852            del dataframe_snpeff_hgvs
 8853            gc.collect()
 8854
 8855        else:
 8856
 8857            log.warning(
 8858                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8859            )
 8860
 8861        # Remove added columns
 8862        for added_column in added_columns:
 8863            self.drop_column(column=added_column)
 8864
 8865    def calculation_extract_nomen(self) -> None:
 8866        """
 8867        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
 8868        """
 8869
 8870        # NOMEN field
 8871        field_nomen_dict = "NOMEN_DICT"
 8872
 8873        # NOMEN structure
 8874        nomen_dict = {
 8875            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
 8876            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
 8877            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
 8878            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
 8879            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
 8880            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
 8881            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
 8882            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
 8883            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
 8884            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
 8885        }
 8886
 8887        # Param
 8888        param = self.get_param()
 8889
 8890        # Prefix
 8891        prefix = self.get_explode_infos_prefix()
 8892
 8893        # Header
 8894        vcf_reader = self.get_header()
 8895
 8896        # Added columns
 8897        added_columns = []
 8898
 8899        # Get HGVS field
 8900        hgvs_field = (
 8901            param.get("calculation", {})
 8902            .get("calculations", {})
 8903            .get("NOMEN", {})
 8904            .get("options", {})
 8905            .get("hgvs_field", "hgvs")
 8906        )
 8907
 8908        # Get NOMEN pattern
 8909        nomen_pattern = (
 8910            param.get("calculation", {})
 8911            .get("calculations", {})
 8912            .get("NOMEN", {})
 8913            .get("options", {})
 8914            .get("pattern", None)
 8915        )
 8916
 8917        # transcripts list of preference sources
 8918        transcripts_sources = {}
 8919
 8920        # Get transcripts
 8921        transcripts_file = (
 8922            param.get("calculation", {})
 8923            .get("calculations", {})
 8924            .get("NOMEN", {})
 8925            .get("options", {})
 8926            .get("transcripts", None)
 8927        )
 8928        transcripts_file = full_path(transcripts_file)
 8929        if transcripts_file:
 8930            if os.path.exists(transcripts_file):
 8931                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
 8932                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
 8933                transcripts_sources["file"] = transcripts_from_file
 8934            else:
 8935                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
 8936                log.error(msg_err)
 8937                raise ValueError(msg_err)
 8938
 8939        # Get transcripts table
 8940        transcripts_table = (
 8941            param.get("calculation", {})
 8942            .get("calculations", {})
 8943            .get("NOMEN", {})
 8944            .get("options", {})
 8945            .get("transcripts_table", self.get_table_variants())
 8946        )
 8947        # Get transcripts column
 8948        transcripts_column = (
 8949            param.get("calculation", {})
 8950            .get("calculations", {})
 8951            .get("NOMEN", {})
 8952            .get("options", {})
 8953            .get("transcripts_column", None)
 8954        )
 8955
 8956        if transcripts_table and transcripts_column:
 8957            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
 8958            # Explode if not exists
 8959            self.explode_infos(fields=[transcripts_column], table=transcripts_table)
 8960        else:
 8961            extra_field_transcript = f"NULL"
 8962
 8963        # Transcripts of preference source order
 8964        transcripts_order = (
 8965            param.get("calculation", {})
 8966            .get("calculations", {})
 8967            .get("NOMEN", {})
 8968            .get("options", {})
 8969            .get("transcripts_order", ["column", "file"])
 8970        )
 8971
 8972        # Transcripts from file
 8973        transcripts = transcripts_sources.get("file", [])
 8974
 8975        # Explode HGVS field in column
 8976        added_columns += self.explode_infos(fields=[hgvs_field])
 8977
 8978        # extra infos
 8979        extra_infos = self.get_extra_infos()
 8980        extra_field = prefix + hgvs_field
 8981
 8982        if extra_field in extra_infos:
 8983
 8984            # Create dataframe
 8985            dataframe_hgvs = self.get_query_to_df(
 8986                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
 8987            )
 8988
 8989            # Create main NOMEN column
 8990            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
 8991                lambda x: find_nomen(
 8992                    hgvs=x.hgvs,
 8993                    transcript=x.transcript,
 8994                    transcripts=transcripts,
 8995                    pattern=nomen_pattern,
 8996                    transcripts_source_order=transcripts_order,
 8997                ),
 8998                axis=1,
 8999            )
 9000
 9001            # Explode NOMEN Structure and create SQL set for update
 9002            sql_nomen_fields = []
 9003            for nomen_field in nomen_dict:
 9004
 9005                # Explode each field into a column
 9006                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
 9007                    lambda x: dict(x).get(nomen_field, "")
 9008                )
 9009
 9010                # Create VCF header field
 9011                vcf_reader.infos[nomen_field] = vcf.parser._Info(
 9012                    nomen_field,
 9013                    ".",
 9014                    "String",
 9015                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
 9016                    "howard calculation",
 9017                    "0",
 9018                    self.code_type_map.get("String"),
 9019                )
 9020                sql_nomen_fields.append(
 9021                    f"""
 9022                        CASE 
 9023                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
 9024                            THEN concat(
 9025                                    ';{nomen_field}=',
 9026                                    dataframe_hgvs."{nomen_field}"
 9027                                )
 9028                            ELSE ''
 9029                        END
 9030                    """
 9031                )
 9032
 9033            # SQL set for update
 9034            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
 9035
 9036            # Update
 9037            sql_update = f"""
 9038                UPDATE variants
 9039                SET "INFO" = 
 9040                    concat(
 9041                        CASE
 9042                            WHEN "INFO" IS NULL
 9043                            THEN ''
 9044                            ELSE "INFO"
 9045                        END,
 9046                        {sql_nomen_fields_set}
 9047                    )
 9048                FROM dataframe_hgvs
 9049                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
 9050                    AND variants."POS" = dataframe_hgvs."POS" 
 9051                    AND variants."REF" = dataframe_hgvs."REF"
 9052                    AND variants."ALT" = dataframe_hgvs."ALT"
 9053            """
 9054            self.conn.execute(sql_update)
 9055
 9056            # Delete dataframe
 9057            del dataframe_hgvs
 9058            gc.collect()
 9059
 9060        # Remove added columns
 9061        for added_column in added_columns:
 9062            self.drop_column(column=added_column)
 9063
 9064    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9065        """
 9066        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9067        pipeline/sample for a variant and updates the variant information in a VCF file.
 9068
 9069        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9070        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9071        VCF header and to update the corresponding field in the variants table, defaults to
 9072        findbypipeline
 9073        :type tag: str (optional)
 9074        """
 9075
 9076        # if FORMAT and samples
 9077        if (
 9078            "FORMAT" in self.get_header_columns_as_list()
 9079            and self.get_header_sample_list()
 9080        ):
 9081
 9082            # findbypipeline annotation field
 9083            findbypipeline_tag = tag
 9084
 9085            # VCF infos tags
 9086            vcf_infos_tags = {
 9087                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9088            }
 9089
 9090            # Prefix
 9091            prefix = self.get_explode_infos_prefix()
 9092
 9093            # Field
 9094            findbypipeline_infos = prefix + findbypipeline_tag
 9095
 9096            # Variants table
 9097            table_variants = self.get_table_variants()
 9098
 9099            # Header
 9100            vcf_reader = self.get_header()
 9101
 9102            # Create variant id
 9103            variant_id_column = self.get_variant_id_column()
 9104            added_columns = [variant_id_column]
 9105
 9106            # variant_id, FORMAT and samples
 9107            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9108                self.get_header_sample_list()
 9109            )
 9110
 9111            # Create dataframe
 9112            dataframe_findbypipeline = self.get_query_to_df(
 9113                f""" SELECT {samples_fields} FROM {table_variants} """
 9114            )
 9115
 9116            # Create findbypipeline column
 9117            dataframe_findbypipeline[findbypipeline_infos] = (
 9118                dataframe_findbypipeline.apply(
 9119                    lambda row: findbypipeline(
 9120                        row, samples=self.get_header_sample_list()
 9121                    ),
 9122                    axis=1,
 9123                )
 9124            )
 9125
 9126            # Add snpeff_hgvs to header
 9127            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9128                findbypipeline_tag,
 9129                ".",
 9130                "String",
 9131                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9132                "howard calculation",
 9133                "0",
 9134                self.code_type_map.get("String"),
 9135            )
 9136
 9137            # Update
 9138            sql_update = f"""
 9139                UPDATE variants
 9140                SET "INFO" = 
 9141                    concat(
 9142                        CASE
 9143                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9144                            THEN ''
 9145                            ELSE concat("INFO", ';')
 9146                        END,
 9147                        CASE 
 9148                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9149                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9150                            THEN concat(
 9151                                    '{findbypipeline_tag}=',
 9152                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9153                                )
 9154                            ELSE ''
 9155                        END
 9156                    )
 9157                FROM dataframe_findbypipeline
 9158                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9159            """
 9160            self.conn.execute(sql_update)
 9161
 9162            # Remove added columns
 9163            for added_column in added_columns:
 9164                self.drop_column(column=added_column)
 9165
 9166            # Delete dataframe
 9167            del dataframe_findbypipeline
 9168            gc.collect()
 9169
 9170    def calculation_genotype_concordance(self) -> None:
 9171        """
 9172        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9173        multi-caller VCF files and updates the variant information in the database.
 9174        """
 9175
 9176        # if FORMAT and samples
 9177        if (
 9178            "FORMAT" in self.get_header_columns_as_list()
 9179            and self.get_header_sample_list()
 9180        ):
 9181
 9182            # genotypeconcordance annotation field
 9183            genotypeconcordance_tag = "genotypeconcordance"
 9184
 9185            # VCF infos tags
 9186            vcf_infos_tags = {
 9187                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9188            }
 9189
 9190            # Prefix
 9191            prefix = self.get_explode_infos_prefix()
 9192
 9193            # Field
 9194            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9195
 9196            # Variants table
 9197            table_variants = self.get_table_variants()
 9198
 9199            # Header
 9200            vcf_reader = self.get_header()
 9201
 9202            # Create variant id
 9203            variant_id_column = self.get_variant_id_column()
 9204            added_columns = [variant_id_column]
 9205
 9206            # variant_id, FORMAT and samples
 9207            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9208                self.get_header_sample_list()
 9209            )
 9210
 9211            # Create dataframe
 9212            dataframe_genotypeconcordance = self.get_query_to_df(
 9213                f""" SELECT {samples_fields} FROM {table_variants} """
 9214            )
 9215
 9216            # Create genotypeconcordance column
 9217            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9218                dataframe_genotypeconcordance.apply(
 9219                    lambda row: genotypeconcordance(
 9220                        row, samples=self.get_header_sample_list()
 9221                    ),
 9222                    axis=1,
 9223                )
 9224            )
 9225
 9226            # Add genotypeconcordance to header
 9227            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9228                genotypeconcordance_tag,
 9229                ".",
 9230                "String",
 9231                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9232                "howard calculation",
 9233                "0",
 9234                self.code_type_map.get("String"),
 9235            )
 9236
 9237            # Update
 9238            sql_update = f"""
 9239                UPDATE variants
 9240                SET "INFO" = 
 9241                    concat(
 9242                        CASE
 9243                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9244                            THEN ''
 9245                            ELSE concat("INFO", ';')
 9246                        END,
 9247                        CASE
 9248                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9249                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9250                            THEN concat(
 9251                                    '{genotypeconcordance_tag}=',
 9252                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9253                                )
 9254                            ELSE ''
 9255                        END
 9256                    )
 9257                FROM dataframe_genotypeconcordance
 9258                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9259            """
 9260            self.conn.execute(sql_update)
 9261
 9262            # Remove added columns
 9263            for added_column in added_columns:
 9264                self.drop_column(column=added_column)
 9265
 9266            # Delete dataframe
 9267            del dataframe_genotypeconcordance
 9268            gc.collect()
 9269
 9270    def calculation_barcode(self, tag: str = "barcode") -> None:
 9271        """
 9272        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9273        updates the INFO field in the file with the calculated barcode values.
 9274
 9275        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9276        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9277        the default tag name is set to "barcode", defaults to barcode
 9278        :type tag: str (optional)
 9279        """
 9280
 9281        # if FORMAT and samples
 9282        if (
 9283            "FORMAT" in self.get_header_columns_as_list()
 9284            and self.get_header_sample_list()
 9285        ):
 9286
 9287            # barcode annotation field
 9288            if not tag:
 9289                tag = "barcode"
 9290
 9291            # VCF infos tags
 9292            vcf_infos_tags = {
 9293                tag: "barcode calculation (VaRank)",
 9294            }
 9295
 9296            # Prefix
 9297            prefix = self.get_explode_infos_prefix()
 9298
 9299            # Field
 9300            barcode_infos = prefix + tag
 9301
 9302            # Variants table
 9303            table_variants = self.get_table_variants()
 9304
 9305            # Header
 9306            vcf_reader = self.get_header()
 9307
 9308            # Create variant id
 9309            variant_id_column = self.get_variant_id_column()
 9310            added_columns = [variant_id_column]
 9311
 9312            # variant_id, FORMAT and samples
 9313            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9314                self.get_header_sample_list()
 9315            )
 9316
 9317            # Create dataframe
 9318            dataframe_barcode = self.get_query_to_df(
 9319                f""" SELECT {samples_fields} FROM {table_variants} """
 9320            )
 9321
 9322            # Create barcode column
 9323            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9324                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9325            )
 9326
 9327            # Add barcode to header
 9328            vcf_reader.infos[tag] = vcf.parser._Info(
 9329                tag,
 9330                ".",
 9331                "String",
 9332                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9333                "howard calculation",
 9334                "0",
 9335                self.code_type_map.get("String"),
 9336            )
 9337
 9338            # Update
 9339            sql_update = f"""
 9340                UPDATE {table_variants}
 9341                SET "INFO" = 
 9342                    concat(
 9343                        CASE
 9344                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9345                            THEN ''
 9346                            ELSE concat("INFO", ';')
 9347                        END,
 9348                        CASE
 9349                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9350                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9351                            THEN concat(
 9352                                    '{tag}=',
 9353                                    dataframe_barcode."{barcode_infos}"
 9354                                )
 9355                            ELSE ''
 9356                        END
 9357                    )
 9358                FROM dataframe_barcode
 9359                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9360            """
 9361            self.conn.execute(sql_update)
 9362
 9363            # Remove added columns
 9364            for added_column in added_columns:
 9365                self.drop_column(column=added_column)
 9366
 9367            # Delete dataframe
 9368            del dataframe_barcode
 9369            gc.collect()
 9370
 9371    def calculation_barcode_family(self, tag: str = "BCF") -> None:
 9372        """
 9373        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
 9374        and updates the INFO field in the file with the calculated barcode values.
 9375
 9376        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
 9377        the barcode tag that will be added to the VCF file during the calculation process. If no value
 9378        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
 9379        :type tag: str (optional)
 9380        """
 9381
 9382        # if FORMAT and samples
 9383        if (
 9384            "FORMAT" in self.get_header_columns_as_list()
 9385            and self.get_header_sample_list()
 9386        ):
 9387
 9388            # barcode annotation field
 9389            if not tag:
 9390                tag = "BCF"
 9391
 9392            # VCF infos tags
 9393            vcf_infos_tags = {
 9394                tag: "barcode family calculation",
 9395                f"{tag}S": "barcode family samples",
 9396            }
 9397
 9398            # Param
 9399            param = self.get_param()
 9400            log.debug(f"param={param}")
 9401
 9402            # Prefix
 9403            prefix = self.get_explode_infos_prefix()
 9404
 9405            # PED param
 9406            ped = (
 9407                param.get("calculation", {})
 9408                .get("calculations", {})
 9409                .get("BARCODEFAMILY", {})
 9410                .get("family_pedigree", None)
 9411            )
 9412            log.debug(f"ped={ped}")
 9413
 9414            # Load PED
 9415            if ped:
 9416
 9417                # Pedigree is a file
 9418                if isinstance(ped, str) and os.path.exists(full_path(ped)):
 9419                    log.debug("Pedigree is file")
 9420                    with open(full_path(ped)) as ped:
 9421                        ped = json.load(ped)
 9422
 9423                # Pedigree is a string
 9424                elif isinstance(ped, str):
 9425                    log.debug("Pedigree is str")
 9426                    try:
 9427                        ped = json.loads(ped)
 9428                        log.debug("Pedigree is json str")
 9429                    except ValueError as e:
 9430                        ped_samples = ped.split(",")
 9431                        ped = {}
 9432                        for ped_sample in ped_samples:
 9433                            ped[ped_sample] = ped_sample
 9434
 9435                # Pedigree is a dict
 9436                elif isinstance(ped, dict):
 9437                    log.debug("Pedigree is dict")
 9438
 9439                # Pedigree is not well formatted
 9440                else:
 9441                    msg_error = "Pedigree not well formatted"
 9442                    log.error(msg_error)
 9443                    raise ValueError(msg_error)
 9444
 9445                # Construct list
 9446                ped_samples = list(ped.values())
 9447
 9448            else:
 9449                log.debug("Pedigree not defined. Take all samples")
 9450                ped_samples = self.get_header_sample_list()
 9451                ped = {}
 9452                for ped_sample in ped_samples:
 9453                    ped[ped_sample] = ped_sample
 9454
 9455            # Check pedigree
 9456            if not ped or len(ped) == 0:
 9457                msg_error = f"Error in pedigree: samples {ped_samples}"
 9458                log.error(msg_error)
 9459                raise ValueError(msg_error)
 9460
 9461            # Log
 9462            log.info(
 9463                "Calculation 'BARCODEFAMILY' - Samples: "
 9464                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
 9465            )
 9466            log.debug(f"ped_samples={ped_samples}")
 9467
 9468            # Field
 9469            barcode_infos = prefix + tag
 9470
 9471            # Variants table
 9472            table_variants = self.get_table_variants()
 9473
 9474            # Header
 9475            vcf_reader = self.get_header()
 9476
 9477            # Create variant id
 9478            variant_id_column = self.get_variant_id_column()
 9479            added_columns = [variant_id_column]
 9480
 9481            # variant_id, FORMAT and samples
 9482            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9483                ped_samples
 9484            )
 9485
 9486            # Create dataframe
 9487            dataframe_barcode = self.get_query_to_df(
 9488                f""" SELECT {samples_fields} FROM {table_variants} """
 9489            )
 9490
 9491            # Create barcode column
 9492            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9493                lambda row: barcode(row, samples=ped_samples), axis=1
 9494            )
 9495
 9496            # Add barcode family to header
 9497            # Add vaf_normalization to header
 9498            vcf_reader.formats[tag] = vcf.parser._Format(
 9499                id=tag,
 9500                num=".",
 9501                type="String",
 9502                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
 9503                type_code=self.code_type_map.get("String"),
 9504            )
 9505            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
 9506                id=f"{tag}S",
 9507                num=".",
 9508                type="String",
 9509                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
 9510                type_code=self.code_type_map.get("String"),
 9511            )
 9512
 9513            # Update
 9514            # for sample in ped_samples:
 9515            sql_update_set = []
 9516            for sample in self.get_header_sample_list() + ["FORMAT"]:
 9517                if sample in ped_samples:
 9518                    value = f'dataframe_barcode."{barcode_infos}"'
 9519                    value_samples = "'" + ",".join(ped_samples) + "'"
 9520                elif sample == "FORMAT":
 9521                    value = f"'{tag}'"
 9522                    value_samples = f"'{tag}S'"
 9523                else:
 9524                    value = "'.'"
 9525                    value_samples = "'.'"
 9526                format_regex = r"[a-zA-Z0-9\s]"
 9527                sql_update_set.append(
 9528                    f"""
 9529                        "{sample}" = 
 9530                        concat(
 9531                            CASE
 9532                                WHEN {table_variants}."{sample}" = './.'
 9533                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
 9534                                ELSE {table_variants}."{sample}"
 9535                            END,
 9536                            ':',
 9537                            {value},
 9538                            ':',
 9539                            {value_samples}
 9540                        )
 9541                    """
 9542                )
 9543
 9544            sql_update_set_join = ", ".join(sql_update_set)
 9545            sql_update = f"""
 9546                UPDATE {table_variants}
 9547                SET {sql_update_set_join}
 9548                FROM dataframe_barcode
 9549                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9550            """
 9551            self.conn.execute(sql_update)
 9552
 9553            # Remove added columns
 9554            for added_column in added_columns:
 9555                self.drop_column(column=added_column)
 9556
 9557            # Delete dataframe
 9558            del dataframe_barcode
 9559            gc.collect()
 9560
    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.

        The trio pedigree (father, mother and child sample names) is read from
        param["calculation"]["calculations"]["TRIO"]["trio_pedigree"], which may be:
        a path to a JSON file, a JSON string, a comma-separated string
        'father,mother,child', or an already-built dict. When absent, the first
        3 samples of the VCF header are used (father, mother, child, in order).

        The per-variant trio value is computed row-by-row by the `trio` helper and
        appended to the INFO column as '<prefix>trio=<value>'.

        :raises ValueError: if the pedigree is malformed or fewer than 3 samples are available
        """

        # Nothing to do without genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF infos tags
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix for exploded INFO columns (applied to the dataframe column name)
            prefix = self.get_explode_infos_prefix()

            # Trio param
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            if trio_ped:

                # Trio pedigree is a file (JSON with 'father'/'mother'/'child' keys)
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = json.load(trio_ped)

                # Trio pedigree is a string: try JSON first, then 'father,mother,child'
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio list (order matters: father, mother, child)
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree (exactly 3 members expected)
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Dataframe column holding the computed trio value
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column used as join key, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create trio column (row-wise apply of the trio helper on genotypes)
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # NOTE(review): the .get() fallback "snpEff hgvs annotations" looks like a
            # copy-paste leftover; it is never used since 'trio' is in vcf_infos_tags.
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<trio_tag>=<value>' to INFO for non-empty trio values.
            # 'dataframe_trio' is referenced by name in the SQL — presumably resolved
            # from the local Python frame via DuckDB's replacement scan; do not rename.
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                             AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                    '{trio_tag}=',
                                    dataframe_trio."{trio_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns (the temporary variant id)
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_trio
            gc.collect()
 9739
 9740    def calculation_vaf_normalization(self) -> None:
 9741        """
 9742        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9743        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9744        :return: The function does not return anything.
 9745        """
 9746
 9747        # if FORMAT and samples
 9748        if (
 9749            "FORMAT" in self.get_header_columns_as_list()
 9750            and self.get_header_sample_list()
 9751        ):
 9752
 9753            # vaf_normalization annotation field
 9754            vaf_normalization_tag = "VAF"
 9755
 9756            # VCF infos tags
 9757            vcf_infos_tags = {
 9758                "VAF": "VAF Variant Frequency",
 9759            }
 9760
 9761            # Prefix
 9762            prefix = self.get_explode_infos_prefix()
 9763
 9764            # Variants table
 9765            table_variants = self.get_table_variants()
 9766
 9767            # Header
 9768            vcf_reader = self.get_header()
 9769
 9770            # Do not calculate if VAF already exists
 9771            if "VAF" in vcf_reader.formats:
 9772                log.debug("VAF already on genotypes")
 9773                return
 9774
 9775            # Create variant id
 9776            variant_id_column = self.get_variant_id_column()
 9777            added_columns = [variant_id_column]
 9778
 9779            # variant_id, FORMAT and samples
 9780            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9781                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9782            )
 9783
 9784            # Create dataframe
 9785            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9786            log.debug(f"query={query}")
 9787            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9788
 9789            vaf_normalization_set = []
 9790
 9791            # for each sample vaf_normalization
 9792            for sample in self.get_header_sample_list():
 9793                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9794                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9795                )
 9796                vaf_normalization_set.append(
 9797                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9798                )
 9799
 9800            # Add VAF to FORMAT
 9801            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9802                "FORMAT"
 9803            ].apply(lambda x: str(x) + ":VAF")
 9804            vaf_normalization_set.append(
 9805                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9806            )
 9807
 9808            # Add vaf_normalization to header
 9809            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9810                id=vaf_normalization_tag,
 9811                num="1",
 9812                type="Float",
 9813                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9814                type_code=self.code_type_map.get("Float"),
 9815            )
 9816
 9817            # Create fields to add in INFO
 9818            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9819
 9820            # Update
 9821            sql_update = f"""
 9822                UPDATE {table_variants}
 9823                SET {sql_vaf_normalization_set}
 9824                FROM dataframe_vaf_normalization
 9825                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9826
 9827            """
 9828            self.conn.execute(sql_update)
 9829
 9830            # Remove added columns
 9831            for added_column in added_columns:
 9832                self.drop_column(column=added_column)
 9833
 9834            # Delete dataframe
 9835            del dataframe_vaf_normalization
 9836            gc.collect()
 9837
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Statistics are computed per variant across all samples by the
        `genotype_stats` helper, then each stat (nb, list, min, max, mean,
        mediane, stdev) is appended to INFO as '<info>_stats_<stat>=<value>'
        and declared in the VCF header.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Nothing to do without genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags (one INFO tag per computed statistic)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns (applied to the dataframe column name)
            prefix = self.get_explode_infos_prefix()

            # Dataframe column holding the dict of computed statistics
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column used as join key, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column (row-wise apply; one dict of stats per variant)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract stats: one dataframe column per stat, from the stats dict
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the stat tag in the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separator before every tag except the first
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO (tag emitted only when the value is not NULL)
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Update — 'dataframe_vaf_stats' is referenced by name in the SQL
            # (presumably resolved from the local frame by the SQL engine); do not rename.
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns (the temporary variant id)
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_vaf_stats
            gc.collect()
 9975
 9976    def calculation_transcripts_annotation(
 9977        self, info_json: str = None, info_format: str = None
 9978    ) -> None:
 9979        """
 9980        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
 9981        field to it if transcripts are available.
 9982
 9983        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
 9984        is a string parameter that represents the information field to be used in the transcripts JSON.
 9985        It is used to specify the JSON format for the transcripts information. If no value is provided
 9986        when calling the method, it defaults to "
 9987        :type info_json: str
 9988        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
 9989        method is a string parameter that specifies the format of the information field to be used in
 9990        the transcripts JSON. It is used to define the format of the information field
 9991        :type info_format: str
 9992        """
 9993
 9994        # Create transcripts table
 9995        transcripts_table = self.create_transcript_view()
 9996
 9997        # Add info field
 9998        if transcripts_table:
 9999            self.transcript_view_to_variants(
10000                transcripts_table=transcripts_table,
10001                transcripts_info_field_json=info_json,
10002                transcripts_info_field_format=info_format,
10003            )
10004        else:
10005            log.info("No Transcripts to process. Check param.json file configuration")
10006
10007    def calculation_transcripts_prioritization(self) -> None:
10008        """
10009        The function `calculation_transcripts_prioritization` creates a transcripts table and
10010        prioritizes transcripts based on certain criteria.
10011        """
10012
10013        # Create transcripts table
10014        transcripts_table = self.create_transcript_view()
10015
10016        # Add info field
10017        if transcripts_table:
10018            self.transcripts_prioritization(transcripts_table=transcripts_table)
10019        else:
10020            log.info("No Transcripts to process. Check param.json file configuration")
10021
10022    def calculation_transcripts_export(self) -> None:
10023        """ """
10024
10025        # Create transcripts table
10026        transcripts_table = self.create_transcript_view()
10027
10028        # Add info field
10029        if transcripts_table:
10030            self.transcripts_export(transcripts_table=transcripts_table)
10031        else:
10032            log.info("No Transcripts to process. Check param.json file configuration")
10033
10034    ###############
10035    # Transcripts #
10036    ###############
10037
10038    def transcripts_export(
10039        self, transcripts_table: str = None, param: dict = {}
10040    ) -> bool:
10041        """ """
10042
10043        log.debug("Start transcripts export...")
10044
10045        # Param
10046        if not param:
10047            param = self.get_param()
10048
10049        # Param export
10050        param_transcript_export = param.get("transcripts", {}).get("export", {})
10051
10052        # Output file
10053        transcripts_export_output = param_transcript_export.get("output", None)
10054
10055        if not param_transcript_export or not transcripts_export_output:
10056            log.warning(f"No transcriipts export parameters defined!")
10057            return False
10058
10059        # List of transcripts annotations
10060        query_describe = f"""
10061            SELECT column_name
10062            FROM (
10063                    DESCRIBE SELECT * FROM {transcripts_table}
10064                )
10065            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10066        """
10067        transcripts_annotations_list = list(
10068            self.get_query_to_df(query=query_describe)["column_name"]
10069        )
10070
10071        # Create transcripts table for export
10072        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10073            random.choices(string.ascii_uppercase + string.digits, k=10)
10074        )
10075        query_create_transcripts_table_export = f"""
10076            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10077        """
10078        self.execute_query(query=query_create_transcripts_table_export)
10079
10080        # Output file format
10081        transcripts_export_output_format = get_file_format(
10082            filename=transcripts_export_output
10083        )
10084
10085        # Format VCF - construct INFO
10086        if transcripts_export_output_format in ["vcf"]:
10087
10088            # Construct query update INFO and header
10089            query_update_info = []
10090            for field in transcripts_annotations_list:
10091
10092                # If field not in header
10093                if field not in self.get_header_infos_list():
10094
10095                    # Add PZ Transcript in header
10096                    self.get_header().infos[field] = vcf.parser._Info(
10097                        field,
10098                        ".",
10099                        "String",
10100                        f"Annotation '{field}' from transcript view",
10101                        "unknown",
10102                        "unknown",
10103                        0,
10104                    )
10105
10106                # Add field as INFO/tag
10107                query_update_info.append(
10108                    f"""
10109                        CASE
10110                            WHEN "{field}" IS NOT NULL
10111                            THEN concat('{field}=', "{field}", ';')    
10112                            ELSE ''     
10113                        END
10114                        """
10115                )
10116
10117            # Query param
10118            query_update_info_value = (
10119                f""" concat('',  {", ".join(query_update_info)}) """
10120            )
10121            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10122
10123        else:
10124
10125            # Query param
10126            query_update_info_value = f""" NULL """
10127            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10128
10129        # Update query INFO column
10130        query_update = f"""
10131            UPDATE {transcripts_table_export}
10132            SET INFO = {query_update_info_value}
10133
10134        """
10135        self.execute_query(query=query_update)
10136
10137        # Export
10138        self.export_output(
10139            output_file=transcripts_export_output,
10140            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10141        )
10142
10143        # Drop transcripts export table
10144        query_drop_transcripts_table_export = f"""
10145            DROP TABLE {transcripts_table_export}
10146        """
10147        self.execute_query(query=query_drop_transcripts_table_export)
10148
10149    def transcripts_prioritization(
10150        self, transcripts_table: str = None, param: dict = {}
10151    ) -> bool:
10152        """
10153        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10154        and updates the variants table with the prioritized information.
10155
10156        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10157        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10158        This parameter is used to identify the table where the transcripts data is stored for the
10159        prioritization process
10160        :type transcripts_table: str
10161        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10162        that contains various configuration settings for the prioritization process of transcripts. It
10163        is used to customize the behavior of the prioritization algorithm and includes settings such as
10164        the prefix for prioritization fields, default profiles, and other
10165        :type param: dict
10166        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10167        transcripts prioritization process is successfully completed, and `False` if there are any
10168        issues or if no profile is defined for transcripts prioritization.
10169        """
10170
10171        log.debug("Start transcripts prioritization...")
10172
10173        # Param
10174        if not param:
10175            param = self.get_param()
10176
10177        # Variants table
10178        table_variants = self.get_table_variants()
10179
10180        # Transcripts table
10181        if transcripts_table is None:
10182            transcripts_table = self.create_transcript_view(
10183                transcripts_table="transcripts", param=param
10184            )
10185        if transcripts_table is None:
10186            msg_err = "No Transcripts table availalble"
10187            log.error(msg_err)
10188            raise ValueError(msg_err)
10189        log.debug(f"transcripts_table={transcripts_table}")
10190
10191        # Get transcripts columns
10192        columns_as_list_query = f"""
10193            DESCRIBE {transcripts_table}
10194        """
10195        columns_as_list = list(
10196            self.get_query_to_df(columns_as_list_query)["column_name"]
10197        )
10198
10199        # Create INFO if not exists
10200        if "INFO" not in columns_as_list:
10201            query_add_info = f"""
10202                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10203            """
10204            self.execute_query(query_add_info)
10205
10206        # Prioritization param and Force only PZ Score and Flag
10207        pz_param = param.get("transcripts", {}).get("prioritization", {})
10208
10209        # PZ profile by default
10210        pz_profile_default = (
10211            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10212        )
10213
10214        # Exit if no profile
10215        if pz_profile_default is None:
10216            log.warning("No profile defined for transcripts prioritization")
10217            return False
10218
10219        # PZ fields
10220        pz_param_pzfields = {}
10221
10222        # PZ field transcripts
10223        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10224
10225        # Add PZ Transcript in header
10226        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10227            pz_fields_transcripts,
10228            ".",
10229            "String",
10230            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10231            "unknown",
10232            "unknown",
10233            code_type_map["String"],
10234        )
10235
10236        # Mandatory fields
10237        pz_mandatory_fields_list = [
10238            "Score",
10239            "Flag",
10240            "Tags",
10241            "Comment",
10242            "Infos",
10243            "Class",
10244        ]
10245        pz_mandatory_fields = []
10246        for pz_mandatory_field in pz_mandatory_fields_list:
10247            pz_mandatory_fields.append(
10248                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10249            )
10250
10251        # PZ fields in param
10252        for pz_field in pz_param.get("pzfields", []):
10253            if pz_field in pz_mandatory_fields_list:
10254                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10255                    pz_param.get("pzprefix", "PTZ") + pz_field
10256                )
10257            else:
10258                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10259                pz_param_pzfields[pz_field] = pz_field_new
10260
10261                # Add PZ Transcript in header
10262                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10263                    pz_field_new,
10264                    ".",
10265                    "String",
10266                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10267                    "unknown",
10268                    "unknown",
10269                    code_type_map["String"],
10270                )
10271
10272        # PZ fields param
10273        pz_param["pzfields"] = pz_mandatory_fields
10274
10275        # Prioritization
10276        prioritization_result = self.prioritization(
10277            table=transcripts_table,
10278            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10279        )
10280        if not prioritization_result:
10281            log.warning("Transcripts prioritization not processed")
10282            return False
10283
10284        # PZ fields sql query
10285        query_update_select_list = []
10286        query_update_concat_list = []
10287        query_update_order_list = []
10288        for pz_param_pzfield in set(
10289            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10290        ):
10291            query_update_select_list.append(f" {pz_param_pzfield}, ")
10292
10293        for pz_param_pzfield in pz_param_pzfields:
10294            query_update_concat_list.append(
10295                f"""
10296                    , CASE 
10297                        WHEN {pz_param_pzfield} IS NOT NULL
10298                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10299                        ELSE ''
10300                    END
10301                """
10302            )
10303
10304        # Order by
10305        pz_orders = (
10306            param.get("transcripts", {})
10307            .get("prioritization", {})
10308            .get("prioritization_transcripts_order", {})
10309        )
10310        if not pz_orders:
10311            pz_orders = {
10312                pz_param.get("pzprefix", "PTZ") + "Flag": "ASC",
10313                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10314            }
10315        for pz_order in pz_orders:
10316            query_update_order_list.append(
10317                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10318            )
10319
10320        # Fields to explode
10321        fields_to_explode = (
10322            list(pz_param_pzfields.keys())
10323            + pz_mandatory_fields
10324            + list(pz_orders.keys())
10325        )
10326        # Remove transcript column as a specific transcript column
10327        if "transcript" in fields_to_explode:
10328            fields_to_explode.remove("transcript")
10329
10330        # Fields intranscripts table
10331        query_transcripts_table = f"""
10332            DESCRIBE SELECT * FROM {transcripts_table}
10333        """
10334        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10335
10336        # Check fields to explode
10337        for field_to_explode in fields_to_explode:
10338            if field_to_explode not in self.get_header_infos_list() + list(
10339                query_transcripts_table.column_name
10340            ):
10341                msg_err = f"INFO/{field_to_explode} NOT IN header"
10342                log.error(msg_err)
10343                raise ValueError(msg_err)
10344
10345        # Explode fields to explode
10346        self.explode_infos(
10347            table=transcripts_table,
10348            fields=fields_to_explode,
10349        )
10350
10351        # Transcript preference file
10352        transcripts_preference_file = (
10353            param.get("transcripts", {})
10354            .get("prioritization", {})
10355            .get("prioritization_transcripts", {})
10356        )
10357        transcripts_preference_file = full_path(transcripts_preference_file)
10358
10359        # Transcript preference forced
10360        transcript_preference_force = (
10361            param.get("transcripts", {})
10362            .get("prioritization", {})
10363            .get("prioritization_transcripts_force", False)
10364        )
10365        # Transcript version forced
10366        transcript_version_force = (
10367            param.get("transcripts", {})
10368            .get("prioritization", {})
10369            .get("prioritization_transcripts_version_force", False)
10370        )
10371
10372        # Transcripts Ranking
10373        if transcripts_preference_file:
10374
10375            # Transcripts file to dataframe
10376            if os.path.exists(transcripts_preference_file):
10377                transcripts_preference_dataframe = transcripts_file_to_df(
10378                    transcripts_preference_file
10379                )
10380            else:
10381                log.error(
10382                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10383                )
10384                raise ValueError(
10385                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10386                )
10387
10388            # Order by depending to transcript preference forcing
10389            if transcript_preference_force:
10390                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10391            else:
10392                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10393
10394            # Transcript columns joined depend on version consideration
10395            if transcript_version_force:
10396                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10397            else:
10398                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10399
10400            # Query ranking for update
10401            query_update_ranking = f"""
10402                SELECT
10403                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10404                    ROW_NUMBER() OVER (
10405                        PARTITION BY "#CHROM", POS, REF, ALT
10406                        ORDER BY {order_by}
10407                    ) AS rn
10408                FROM {transcripts_table}
10409                LEFT JOIN 
10410                    (
10411                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10412                        FROM transcripts_preference_dataframe
10413                    ) AS transcripts_preference
10414                ON {transcripts_version_join}
10415            """
10416
10417        else:
10418
10419            # Query ranking for update
10420            query_update_ranking = f"""
10421                SELECT
10422                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10423                    ROW_NUMBER() OVER (
10424                        PARTITION BY "#CHROM", POS, REF, ALT
10425                        ORDER BY {" , ".join(query_update_order_list)}
10426                    ) AS rn
10427                FROM {transcripts_table}
10428            """
10429
10430        # Export Transcripts prioritization infos to variants table
10431        query_update = f"""
10432            WITH RankedTranscripts AS (
10433                {query_update_ranking}
10434            )
10435            UPDATE {table_variants}
10436                SET
10437                INFO = CONCAT(CASE
10438                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10439                            THEN ''
10440                            ELSE concat("INFO", ';')
10441                        END,
10442                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10443                        )
10444            FROM
10445                RankedTranscripts
10446            WHERE
10447                rn = 1
10448                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10449                AND variants."POS" = RankedTranscripts."POS"
10450                AND variants."REF" = RankedTranscripts."REF"
10451                AND variants."ALT" = RankedTranscripts."ALT"     
10452        """
10453
10454        # log.debug(f"query_update={query_update}")
10455        self.execute_query(query=query_update)
10456
10457        # Return
10458        return True
10459
10460    def create_transcript_view_from_columns_map(
10461        self,
10462        transcripts_table: str = "transcripts",
10463        columns_maps: dict = {},
10464        added_columns: list = [],
10465        temporary_tables: list = None,
10466        annotation_fields: list = None,
10467        column_rename: dict = {},
10468        column_clean: bool = False,
10469        column_case: str = None,
10470    ) -> tuple[list, list, list]:
10471        """
10472        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10473        specified columns mapping for transcripts data.
10474
10475        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10476        of the table where the transcripts data is stored or will be stored in the database. This table
10477        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10478        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10479        :type transcripts_table: str (optional)
10480        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10481        about how to map columns from a transcripts table to create a view. Each entry in the
10482        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10483        typically includes details such as the main transcript column and additional information columns
10484        :type columns_maps: dict
10485        :param added_columns: The `added_columns` parameter in the
10486        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10487        that will be added to the view being created based on the columns map provided. These columns
10488        are generated by exploding the transcript information columns along with the main transcript
10489        column
10490        :type added_columns: list
10491        :param temporary_tables: The `temporary_tables` parameter in the
10492        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10493        tables created during the process of creating a transcript view from a columns map. These
10494        temporary tables are used to store intermediate results or transformations before the final view
10495        is generated
10496        :type temporary_tables: list
10497        :param annotation_fields: The `annotation_fields` parameter in the
10498        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10499        used for annotation in the query view creation process. These fields are extracted from the
10500        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10501        :type annotation_fields: list
10502        :param column_rename: The `column_rename` parameter in the
10503        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10504        custom renaming for columns during the creation of the temporary table view. This parameter
10505        provides a mapping of original column names to the desired renamed column names. By using this
10506        parameter,
10507        :type column_rename: dict
10508        :param column_clean: The `column_clean` parameter in the
10509        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10510        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10511        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10512        False
10513        :type column_clean: bool (optional)
10514        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10515        function is used to specify the case transformation to be applied to the columns during the view
10516        creation process. It allows you to control whether the column values should be converted to
10517        lowercase, uppercase, or remain unchanged
10518        :type column_case: str
10519        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10520        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10521        """
10522
10523        log.debug("Start transcrpts view creation from columns map...")
10524
10525        # "from_columns_map": [
10526        #     {
10527        #         "transcripts_column": "Ensembl_transcriptid",
10528        #         "transcripts_infos_columns": [
10529        #             "genename",
10530        #             "Ensembl_geneid",
10531        #             "LIST_S2_score",
10532        #             "LIST_S2_pred",
10533        #         ],
10534        #     },
10535        #     {
10536        #         "transcripts_column": "Ensembl_transcriptid",
10537        #         "transcripts_infos_columns": [
10538        #             "genename",
10539        #             "VARITY_R_score",
10540        #             "Aloft_pred",
10541        #         ],
10542        #     },
10543        # ],
10544
10545        # Init
10546        if temporary_tables is None:
10547            temporary_tables = []
10548        if annotation_fields is None:
10549            annotation_fields = []
10550
10551        # Variants table
10552        table_variants = self.get_table_variants()
10553
10554        for columns_map in columns_maps:
10555
10556            # Transcript column
10557            transcripts_column = columns_map.get("transcripts_column", None)
10558
10559            # Transcripts infos columns
10560            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10561
10562            # Transcripts infos columns rename
10563            column_rename = columns_map.get("column_rename", column_rename)
10564
10565            # Transcripts infos columns clean
10566            column_clean = columns_map.get("column_clean", column_clean)
10567
10568            # Transcripts infos columns case
10569            column_case = columns_map.get("column_case", column_case)
10570
10571            if transcripts_column is not None:
10572
10573                # Explode
10574                added_columns += self.explode_infos(
10575                    fields=[transcripts_column] + transcripts_infos_columns
10576                )
10577
10578                # View clauses
10579                clause_select_variants = []
10580                clause_select_tanscripts = []
10581                for field in [transcripts_column] + transcripts_infos_columns:
10582
10583                    # AS field
10584                    as_field = field
10585
10586                    # Rename
10587                    if column_rename:
10588                        as_field = column_rename.get(as_field, as_field)
10589
10590                    # Clean
10591                    if column_clean:
10592                        as_field = clean_annotation_field(as_field)
10593
10594                    # Case
10595                    if column_case:
10596                        if column_case.lower() in ["lower"]:
10597                            as_field = as_field.lower()
10598                        elif column_case.lower() in ["upper"]:
10599                            as_field = as_field.upper()
10600
10601                    # Clause select Variants
10602                    clause_select_variants.append(
10603                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10604                    )
10605
10606                    if field in [transcripts_column]:
10607                        clause_select_tanscripts.append(
10608                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10609                        )
10610                    else:
10611                        clause_select_tanscripts.append(
10612                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10613                        )
10614                        annotation_fields.append(as_field)
10615
10616                # Querey View
10617                query = f""" 
10618                    SELECT
10619                        "#CHROM", POS, REF, ALT, INFO,
10620                        "{transcripts_column}" AS 'transcript',
10621                        {", ".join(clause_select_tanscripts)}
10622                    FROM (
10623                        SELECT 
10624                            "#CHROM", POS, REF, ALT, INFO,
10625                            {", ".join(clause_select_variants)}
10626                        FROM {table_variants}
10627                        )
10628                    WHERE "{transcripts_column}" IS NOT NULL
10629                """
10630
10631                # Create temporary table
10632                temporary_table = transcripts_table + "".join(
10633                    random.choices(string.ascii_uppercase + string.digits, k=10)
10634                )
10635
10636                # Temporary_tables
10637                temporary_tables.append(temporary_table)
10638                query_view = f"""
10639                    CREATE TEMPORARY TABLE {temporary_table}
10640                    AS ({query})
10641                """
10642                self.execute_query(query=query_view)
10643
10644        return added_columns, temporary_tables, annotation_fields
10645
10646    def create_transcript_view_from_column_format(
10647        self,
10648        transcripts_table: str = "transcripts",
10649        column_formats: dict = {},
10650        temporary_tables: list = None,
10651        annotation_fields: list = None,
10652        column_rename: dict = {},
10653        column_clean: bool = False,
10654        column_case: str = None,
10655    ) -> tuple[list, list, list]:
10656        """
10657        The `create_transcript_view_from_column_format` function generates a transcript view based on
10658        specified column formats, adds additional columns and annotation fields, and returns the list of
10659        temporary tables and annotation fields.
10660
10661        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10662        of the table containing the transcripts data. This table will be used as the base table for
10663        creating the transcript view. The default value for this parameter is "transcripts", but you can
10664        provide a different table name if needed, defaults to transcripts
10665        :type transcripts_table: str (optional)
10666        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10667        about the columns to be used for creating the transcript view. Each entry in the dictionary
10668        specifies the mapping between a transcripts column and a transcripts infos column. This
10669        parameter allows you to define how the columns from the transcripts table should be transformed
10670        or mapped
10671        :type column_formats: dict
10672        :param temporary_tables: The `temporary_tables` parameter in the
10673        `create_transcript_view_from_column_format` function is a list that stores the names of
10674        temporary views created during the process of creating a transcript view from a column format.
10675        These temporary views are used to manipulate and extract data before generating the final
10676        transcript view
10677        :type temporary_tables: list
10678        :param annotation_fields: The `annotation_fields` parameter in the
10679        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10680        that are extracted from the temporary views created during the process. These annotation fields
10681        are obtained by querying the temporary views and extracting the column names excluding specific
10682        columns like `#CH
10683        :type annotation_fields: list
10684        :param column_rename: The `column_rename` parameter in the
10685        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10686        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10687        column names to new column names in this dictionary, you can rename specific columns during the
10688        process
10689        :type column_rename: dict
10690        :param column_clean: The `column_clean` parameter in the
10691        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10692        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10693        will be cleaned during the creation of the transcript view based on the specified column format,
10694        defaults to False
10695        :type column_clean: bool (optional)
10696        :param column_case: The `column_case` parameter in the
10697        `create_transcript_view_from_column_format` function is used to specify the case transformation
10698        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10699        to convert the column names to uppercase or lowercase, respectively
10700        :type column_case: str
10701        :return: The `create_transcript_view_from_column_format` function returns two lists:
10702        `temporary_tables` and `annotation_fields`.
10703        """
10704
10705        log.debug("Start transcrpts view creation from column format...")
10706
10707        #  "from_column_format": [
10708        #     {
10709        #         "transcripts_column": "ANN",
10710        #         "transcripts_infos_column": "Feature_ID",
10711        #     }
10712        # ],
10713
10714        # Init
10715        if temporary_tables is None:
10716            temporary_tables = []
10717        if annotation_fields is None:
10718            annotation_fields = []
10719
10720        for column_format in column_formats:
10721
10722            # annotation field and transcript annotation field
10723            annotation_field = column_format.get("transcripts_column", "ANN")
10724            transcript_annotation = column_format.get(
10725                "transcripts_infos_column", "Feature_ID"
10726            )
10727
10728            # Transcripts infos columns rename
10729            column_rename = column_format.get("column_rename", column_rename)
10730
10731            # Transcripts infos columns clean
10732            column_clean = column_format.get("column_clean", column_clean)
10733
10734            # Transcripts infos columns case
10735            column_case = column_format.get("column_case", column_case)
10736
10737            # Temporary View name
10738            temporary_view_name = transcripts_table + "".join(
10739                random.choices(string.ascii_uppercase + string.digits, k=10)
10740            )
10741
10742            # Create temporary view name
10743            temporary_view_name = self.annotation_format_to_table(
10744                uniquify=True,
10745                annotation_field=annotation_field,
10746                view_name=temporary_view_name,
10747                annotation_id=transcript_annotation,
10748                column_rename=column_rename,
10749                column_clean=column_clean,
10750                column_case=column_case,
10751            )
10752
10753            # Annotation fields
10754            if temporary_view_name:
10755                query_annotation_fields = f"""
10756                    SELECT *
10757                    FROM (
10758                        DESCRIBE SELECT *
10759                        FROM {temporary_view_name}
10760                        )
10761                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10762                """
10763                df_annotation_fields = self.get_query_to_df(
10764                    query=query_annotation_fields
10765                )
10766
10767                # Add temporary view and annotation fields
10768                temporary_tables.append(temporary_view_name)
10769                annotation_fields += list(set(df_annotation_fields["column_name"]))
10770
10771        return temporary_tables, annotation_fields
10772
10773    def create_transcript_view(
10774        self,
10775        transcripts_table: str = None,
10776        transcripts_table_drop: bool = True,
10777        param: dict = {},
10778    ) -> str:
10779        """
10780        The `create_transcript_view` function generates a transcript view by processing data from a
10781        specified table based on provided parameters and structural information.
10782
10783        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10784        is used to specify the name of the table that will store the final transcript view data. If a table
10785        name is not provided, the function will create a new table to store the transcript view data, and by
10786        default,, defaults to transcripts
10787        :type transcripts_table: str (optional)
10788        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10789        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10790        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10791        the function will drop the existing transcripts table if it exists, defaults to True
10792        :type transcripts_table_drop: bool (optional)
10793        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10794        contains information needed to create a transcript view. It includes details such as the structure
10795        of the transcripts, columns mapping, column formats, and other necessary information for generating
10796        the view. This parameter allows for flexibility and customization
10797        :type param: dict
10798        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10799        created or modified during the execution of the function.
10800        """
10801
10802        log.debug("Start transcripts view creation...")
10803
10804        # Default
10805        transcripts_table_default = "transcripts"
10806
10807        # Param
10808        if not param:
10809            param = self.get_param()
10810
10811        # Struct
10812        struct = param.get("transcripts", {}).get("struct", None)
10813
10814        # Transcript veresion
10815        transcript_id_remove_version = param.get("transcripts", {}).get(
10816            "transcript_id_remove_version", False
10817        )
10818
10819        # Transcripts mapping
10820        transcript_id_mapping_file = param.get("transcripts", {}).get(
10821            "transcript_id_mapping_file", None
10822        )
10823
10824        # Transcripts mapping
10825        transcript_id_mapping_force = param.get("transcripts", {}).get(
10826            "transcript_id_mapping_force", None
10827        )
10828
10829        if struct:
10830
10831            # Transcripts table
10832            if transcripts_table is None:
10833                transcripts_table = param.get("transcripts", {}).get(
10834                    "table", transcripts_table_default
10835                )
10836
10837            # added_columns
10838            added_columns = []
10839
10840            # Temporary tables
10841            temporary_tables = []
10842
10843            # Annotation fields
10844            annotation_fields = []
10845
10846            # from columns map
10847            columns_maps = struct.get("from_columns_map", [])
10848            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10849                self.create_transcript_view_from_columns_map(
10850                    transcripts_table=transcripts_table,
10851                    columns_maps=columns_maps,
10852                    added_columns=added_columns,
10853                    temporary_tables=temporary_tables,
10854                    annotation_fields=annotation_fields,
10855                )
10856            )
10857            added_columns += added_columns_tmp
10858            temporary_tables += temporary_tables_tmp
10859            annotation_fields += annotation_fields_tmp
10860
10861            # from column format
10862            column_formats = struct.get("from_column_format", [])
10863            temporary_tables_tmp, annotation_fields_tmp = (
10864                self.create_transcript_view_from_column_format(
10865                    transcripts_table=transcripts_table,
10866                    column_formats=column_formats,
10867                    temporary_tables=temporary_tables,
10868                    annotation_fields=annotation_fields,
10869                )
10870            )
10871            temporary_tables += temporary_tables_tmp
10872            annotation_fields += annotation_fields_tmp
10873
10874            # Remove some specific fields/column
10875            annotation_fields = list(set(annotation_fields))
10876            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10877                if field in annotation_fields:
10878                    annotation_fields.remove(field)
10879
10880            # Merge temporary tables query
10881            query_merge = ""
10882            for temporary_table in list(set(temporary_tables)):
10883
10884                # First temporary table
10885                if not query_merge:
10886                    query_merge = f"""
10887                        SELECT * FROM {temporary_table}
10888                    """
10889                # other temporary table (using UNION)
10890                else:
10891                    query_merge += f"""
10892                        UNION BY NAME SELECT * FROM {temporary_table}
10893                    """
10894
10895            # transcript table tmp
10896            transcript_table_tmp = "transcripts_tmp"
10897            transcript_table_tmp2 = "transcripts_tmp2"
10898            transcript_table_tmp3 = "transcripts_tmp3"
10899
10900            # Merge on transcript
10901            query_merge_on_transcripts_annotation_fields = []
10902
10903            # Add transcript list
10904            query_merge_on_transcripts_annotation_fields.append(
10905                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
10906            )
10907
10908            # Aggregate all annotations fields
10909            for annotation_field in set(annotation_fields):
10910                query_merge_on_transcripts_annotation_fields.append(
10911                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
10912                )
10913
10914            # Transcripts mapping
10915            if transcript_id_mapping_file:
10916
10917                # Transcript dataframe
10918                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
10919                transcript_id_mapping_dataframe = transcripts_file_to_df(
10920                    transcript_id_mapping_file, column_names=["transcript", "alias"]
10921                )
10922
10923                # Transcript version remove
10924                if transcript_id_remove_version:
10925                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
10926                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
10927                    query_left_join = f"""
10928                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10929                    """
10930                else:
10931                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
10932                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
10933                    query_left_join = f"""
10934                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10935                    """
10936
10937                # Transcript column for group by merge
10938                query_transcript_merge_group_by = """
10939                        CASE
10940                            WHEN transcript_mapped NOT IN ('')
10941                            THEN split_part(transcript_mapped, '.', 1)
10942                            ELSE split_part(transcript_original, '.', 1)
10943                        END
10944                    """
10945
10946                # Merge query
10947                transcripts_tmp2_query = f"""
10948                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
10949                    FROM ({query_merge}) AS {transcript_table_tmp}
10950                    {query_left_join}
10951                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
10952                """
10953
10954                # Retrive columns after mege
10955                transcripts_tmp2_describe_query = f"""
10956                    DESCRIBE {transcripts_tmp2_query}
10957                """
10958                transcripts_tmp2_describe_list = list(
10959                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
10960                        "column_name"
10961                    ]
10962                )
10963
10964                # Create list of columns for select clause
10965                transcripts_tmp2_describe_select_clause = []
10966                for field in transcripts_tmp2_describe_list:
10967                    if field not in [
10968                        "#CHROM",
10969                        "POS",
10970                        "REF",
10971                        "ALT",
10972                        "INFO",
10973                        "transcript_mapped",
10974                    ]:
10975                        as_field = field
10976                        if field in ["transcript_original"]:
10977                            as_field = "transcripts_mapped"
10978                        transcripts_tmp2_describe_select_clause.append(
10979                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
10980                        )
10981
10982                # Merge with mapping
10983                query_merge_on_transcripts = f"""
10984                    SELECT
10985                        "#CHROM", POS, REF, ALT, INFO,
10986                        CASE
10987                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
10988                            THEN ANY_VALUE(transcript_mapped)
10989                            ELSE ANY_VALUE(transcript_original)
10990                        END AS transcript,
10991                        {", ".join(transcripts_tmp2_describe_select_clause)}
10992                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
10993                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
10994                        {query_transcript_merge_group_by}
10995                """
10996
10997                # Add transcript filter from mapping file
10998                if transcript_id_mapping_force:
10999                    query_merge_on_transcripts = f"""
11000                        SELECT *
11001                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11002                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11003                    """
11004
11005            # No transcript mapping
11006            else:
11007
11008                # Remove transcript version
11009                if transcript_id_remove_version:
11010                    query_transcript_column = f"""
11011                        split_part({transcript_table_tmp}.transcript, '.', 1)
11012                    """
11013                else:
11014                    query_transcript_column = """
11015                        transcript
11016                    """
11017
11018                # Query sections
11019                query_transcript_column_select = (
11020                    f"{query_transcript_column} AS transcript"
11021                )
11022                query_transcript_column_group_by = query_transcript_column
11023
11024                # Query for transcripts view
11025                query_merge_on_transcripts = f"""
11026                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11027                    FROM ({query_merge}) AS {transcript_table_tmp}
11028                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11029                """
11030
11031            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")
11032
11033            # Drop transcript view is necessary
11034            if transcripts_table_drop:
11035                query_drop = f"""
11036                    DROP TABLE IF EXISTS {transcripts_table};
11037                """
11038                self.execute_query(query=query_drop)
11039
11040            # Merge and create transcript view
11041            query_create_view = f"""
11042                CREATE TABLE IF NOT EXISTS {transcripts_table}
11043                AS {query_merge_on_transcripts}
11044            """
11045            self.execute_query(query=query_create_view)
11046
11047            # Remove added columns
11048            for added_column in added_columns:
11049                self.drop_column(column=added_column)
11050
11051        else:
11052
11053            transcripts_table = None
11054
11055        return transcripts_table
11056
11057    def annotation_format_to_table(
11058        self,
11059        uniquify: bool = True,
11060        annotation_field: str = "ANN",
11061        annotation_id: str = "Feature_ID",
11062        view_name: str = "transcripts",
11063        column_rename: dict = {},
11064        column_clean: bool = False,
11065        column_case: str = None,
11066    ) -> str:
11067        """
11068        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11069        structured table format, ensuring unique values and creating a temporary table for further
11070        processing or analysis.
11071
11072        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11073        unique values in the output or not. If set to `True`, the function will make sure that the
11074        output values are unique, defaults to True
11075        :type uniquify: bool (optional)
11076        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11077        that contains the annotation information for each variant. This field is used to extract the
11078        annotation details for further processing in the function. By default, it is set to "ANN",
11079        defaults to ANN
11080        :type annotation_field: str (optional)
11081        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11082        is used to specify the identifier for the annotation feature. This identifier will be used as a
11083        column name in the resulting table or view that is created based on the annotation data. It
11084        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11085        :type annotation_id: str (optional)
11086        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11087        to specify the name of the temporary table that will be created to store the transformed
11088        annotation data. This table will hold the extracted information from the annotation field in a
11089        structured format for further processing or analysis. By default,, defaults to transcripts
11090        :type view_name: str (optional)
11091        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11092        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11093        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11094        created based on the annotation data. This feature enables
11095        :type column_rename: dict
11096        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11097        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11098        If set to `True`, the function will clean the annotation field before further processing. This
11099        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11100        to False
11101        :type column_clean: bool (optional)
11102        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11103        used to specify the case transformation to be applied to the column names extracted from the
11104        annotation data. It allows you to set the case of the column names to either lowercase or
11105        uppercase for consistency or other specific requirements during the conversion
11106        :type column_case: str
11107        :return: The function `annotation_format_to_table` is returning the name of the view created,
11108        which is stored in the variable `view_name`.
11109        """
11110
11111        # Annotation field
11112        annotation_format = "annotation_explode"
11113
11114        # Transcript annotation
11115        if column_rename:
11116            annotation_id = column_rename.get(annotation_id, annotation_id)
11117
11118        if column_clean:
11119            annotation_id = clean_annotation_field(annotation_id)
11120
11121        # Prefix
11122        prefix = self.get_explode_infos_prefix()
11123        if prefix:
11124            prefix = "INFO/"
11125
11126        # Annotation fields
11127        annotation_infos = prefix + annotation_field
11128        annotation_format_infos = prefix + annotation_format
11129
11130        # Variants table
11131        table_variants = self.get_table_variants()
11132
11133        # Header
11134        vcf_reader = self.get_header()
11135
11136        # Add columns
11137        added_columns = []
11138
11139        # Explode HGVS field in column
11140        added_columns += self.explode_infos(fields=[annotation_field])
11141
11142        if annotation_field in vcf_reader.infos:
11143
11144            # Extract ANN header
11145            ann_description = vcf_reader.infos[annotation_field].desc
11146            pattern = r"'(.+?)'"
11147            match = re.search(pattern, ann_description)
11148            if match:
11149                ann_header_match = match.group(1).split(" | ")
11150                ann_header = []
11151                ann_header_desc = {}
11152                for i in range(len(ann_header_match)):
11153                    ann_header_info = "".join(
11154                        char for char in ann_header_match[i] if char.isalnum()
11155                    )
11156                    ann_header.append(ann_header_info)
11157                    ann_header_desc[ann_header_info] = ann_header_match[i]
11158                if not ann_header_desc:
11159                    raise ValueError("Invalid header description format")
11160            else:
11161                raise ValueError("Invalid header description format")
11162
11163            # Create variant id
11164            variant_id_column = self.get_variant_id_column()
11165            added_columns += [variant_id_column]
11166
11167            # Create dataframe
11168            dataframe_annotation_format = self.get_query_to_df(
11169                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
11170            )
11171
11172            # Create annotation columns
11173            dataframe_annotation_format[
11174                annotation_format_infos
11175            ] = dataframe_annotation_format[annotation_infos].apply(
11176                lambda x: explode_annotation_format(
11177                    annotation=str(x),
11178                    uniquify=uniquify,
11179                    output_format="JSON",
11180                    prefix="",
11181                    header=list(ann_header_desc.values()),
11182                )
11183            )
11184
11185            # Find keys
11186            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
11187            df_keys = self.get_query_to_df(query=query_json)
11188
11189            # Check keys
11190            query_json_key = []
11191            for _, row in df_keys.iterrows():
11192
11193                # Key
11194                key = row.iloc[0]
11195                key_clean = key
11196
11197                # key rename
11198                if column_rename:
11199                    key_clean = column_rename.get(key_clean, key_clean)
11200
11201                # key clean
11202                if column_clean:
11203                    key_clean = clean_annotation_field(key_clean)
11204
11205                # Key case
11206                if column_case:
11207                    if column_case.lower() in ["lower"]:
11208                        key_clean = key_clean.lower()
11209                    elif column_case.lower() in ["upper"]:
11210                        key_clean = key_clean.upper()
11211
11212                # Type
11213                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
11214
11215                # Get DataFrame from query
11216                df_json_type = self.get_query_to_df(query=query_json_type)
11217
11218                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
11219                with pd.option_context("future.no_silent_downcasting", True):
11220                    df_json_type.fillna(value="", inplace=True)
11221                    replace_dict = {None: np.nan, "": np.nan}
11222                    df_json_type.replace(replace_dict, inplace=True)
11223                    df_json_type.dropna(inplace=True)
11224
11225                # Detect column type
11226                column_type = detect_column_type(df_json_type[key_clean])
11227
11228                # Append
11229                query_json_key.append(
11230                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
11231                )
11232
11233            # Create view
11234            query_view = f"""
11235                CREATE TEMPORARY TABLE {view_name}
11236                AS (
11237                    SELECT *, {annotation_id} AS 'transcript'
11238                    FROM (
11239                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11240                        FROM dataframe_annotation_format
11241                        )
11242                    );
11243            """
11244            self.execute_query(query=query_view)
11245
11246        else:
11247
11248            # Return None
11249            view_name = None
11250
11251        # Remove added columns
11252        for added_column in added_columns:
11253            self.drop_column(column=added_column)
11254
11255        return view_name
11256
11257    def transcript_view_to_variants(
11258        self,
11259        transcripts_table: str = None,
11260        transcripts_column_id: str = None,
11261        transcripts_info_json: str = None,
11262        transcripts_info_field_json: str = None,
11263        transcripts_info_format: str = None,
11264        transcripts_info_field_format: str = None,
11265        param: dict = {},
11266    ) -> bool:
11267        """
11268        The `transcript_view_to_variants` function updates a variants table with information from
11269        transcripts in JSON format.
11270
11271        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11272        table containing the transcripts data. If this parameter is not provided, the function will
11273        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11274        :type transcripts_table: str
11275        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11276        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11277        identifier is used to match transcripts with variants in the database
11278        :type transcripts_column_id: str
11279        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11280        of the column in the variants table where the transcripts information will be stored in JSON
11281        format. This parameter allows you to define the column in the variants table that will hold the
11282        JSON-formatted information about transcripts
11283        :type transcripts_info_json: str
11284        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11285        specify the field in the VCF header that will contain information about transcripts in JSON
11286        format. This field will be added to the VCF header as an INFO field with the specified name
11287        :type transcripts_info_field_json: str
11288        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11289        format of the information about transcripts that will be stored in the variants table. This
11290        format can be used to define how the transcript information will be structured or displayed
11291        within the variants table
11292        :type transcripts_info_format: str
11293        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11294        specify the field in the VCF header that will contain information about transcripts in a
11295        specific format. This field will be added to the VCF header as an INFO field with the specified
11296        name
11297        :type transcripts_info_field_format: str
11298        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11299        that contains various configuration settings related to transcripts. It is used to provide
11300        default values for certain parameters if they are not explicitly provided when calling the
11301        method. The `param` dictionary can be passed as an argument
11302        :type param: dict
11303        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11304        if the operation is successful and `False` if certain conditions are not met.
11305        """
11306
11307        msg_info_prefix = "Start transcripts view to variants annotations"
11308
11309        log.debug(f"{msg_info_prefix}...")
11310
11311        # Default
11312        transcripts_table_default = "transcripts"
11313        transcripts_column_id_default = "transcript"
11314        transcripts_info_json_default = None
11315        transcripts_info_format_default = None
11316        transcripts_info_field_json_default = None
11317        transcripts_info_field_format_default = None
11318
11319        # Param
11320        if not param:
11321            param = self.get_param()
11322
11323        # Transcripts table
11324        if transcripts_table is None:
11325            transcripts_table = param.get("transcripts", {}).get(
11326                "table", transcripts_table_default
11327            )
11328
11329        # Transcripts column ID
11330        if transcripts_column_id is None:
11331            transcripts_column_id = param.get("transcripts", {}).get(
11332                "column_id", transcripts_column_id_default
11333            )
11334
11335        # Transcripts info json
11336        if transcripts_info_json is None:
11337            transcripts_info_json = param.get("transcripts", {}).get(
11338                "transcripts_info_json", transcripts_info_json_default
11339            )
11340
11341        # Transcripts info field JSON
11342        if transcripts_info_field_json is None:
11343            transcripts_info_field_json = param.get("transcripts", {}).get(
11344                "transcripts_info_field_json", transcripts_info_field_json_default
11345            )
11346        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11347        #     transcripts_info_json = transcripts_info_field_json
11348
11349        # Transcripts info format
11350        if transcripts_info_format is None:
11351            transcripts_info_format = param.get("transcripts", {}).get(
11352                "transcripts_info_format", transcripts_info_format_default
11353            )
11354
11355        # Transcripts info field FORMAT
11356        if transcripts_info_field_format is None:
11357            transcripts_info_field_format = param.get("transcripts", {}).get(
11358                "transcripts_info_field_format", transcripts_info_field_format_default
11359            )
11360        # if (
11361        #     transcripts_info_field_format is not None
11362        #     and transcripts_info_format is None
11363        # ):
11364        #     transcripts_info_format = transcripts_info_field_format
11365
11366        # Variants table
11367        table_variants = self.get_table_variants()
11368
11369        # Check info columns param
11370        if (
11371            transcripts_info_json is None
11372            and transcripts_info_field_json is None
11373            and transcripts_info_format is None
11374            and transcripts_info_field_format is None
11375        ):
11376            return False
11377
11378        # Transcripts infos columns
11379        query_transcripts_infos_columns = f"""
11380            SELECT *
11381            FROM (
11382                DESCRIBE SELECT * FROM {transcripts_table}
11383                )
11384            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11385        """
11386        transcripts_infos_columns = list(
11387            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11388        )
11389
11390        # View results
11391        clause_select = []
11392        clause_to_json = []
11393        clause_to_format = []
11394        for field in transcripts_infos_columns:
11395            # Do not consider INFO field for export into fields
11396            if field not in ["INFO"]:
11397                clause_select.append(
11398                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11399                )
11400                clause_to_json.append(f""" '{field}': "{field}" """)
11401                clause_to_format.append(f""" "{field}" """)
11402
11403        # Update
11404        update_set_json = []
11405        update_set_format = []
11406
11407        # VCF header
11408        vcf_reader = self.get_header()
11409
11410        # Transcripts to info column in JSON
11411        if transcripts_info_json:
11412
11413            # Create column on variants table
11414            self.add_column(
11415                table_name=table_variants,
11416                column_name=transcripts_info_json,
11417                column_type="JSON",
11418                default_value=None,
11419                drop=False,
11420            )
11421
11422            # Add header
11423            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11424                transcripts_info_json,
11425                ".",
11426                "String",
11427                "Transcripts in JSON format",
11428                "unknwon",
11429                "unknwon",
11430                self.code_type_map["String"],
11431            )
11432
11433            # Add to update
11434            update_set_json.append(
11435                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11436            )
11437
11438        # Transcripts to info field in JSON
11439        if transcripts_info_field_json:
11440
11441            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11442
11443            # Add to update
11444            update_set_json.append(
11445                f""" 
11446                    INFO = concat(
11447                            CASE
11448                                WHEN INFO NOT IN ('', '.')
11449                                THEN INFO
11450                                ELSE ''
11451                            END,
11452                            CASE
11453                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11454                                THEN concat(
11455                                    ';{transcripts_info_field_json}=',
11456                                    t.{transcripts_info_json}
11457                                )
11458                                ELSE ''
11459                            END
11460                            )
11461                """
11462            )
11463
11464            # Add header
11465            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11466                transcripts_info_field_json,
11467                ".",
11468                "String",
11469                "Transcripts in JSON format",
11470                "unknwon",
11471                "unknwon",
11472                self.code_type_map["String"],
11473            )
11474
11475        if update_set_json:
11476
11477            # Update query
11478            query_update = f"""
11479                UPDATE {table_variants}
11480                    SET {", ".join(update_set_json)}
11481                FROM
11482                (
11483                    SELECT
11484                        "#CHROM", POS, REF, ALT,
11485                            concat(
11486                            '{{',
11487                            string_agg(
11488                                '"' || "{transcripts_column_id}" || '":' ||
11489                                to_json(json_output)
11490                            ),
11491                            '}}'
11492                            )::JSON AS {transcripts_info_json}
11493                    FROM
11494                        (
11495                        SELECT
11496                            "#CHROM", POS, REF, ALT,
11497                            "{transcripts_column_id}",
11498                            to_json(
11499                                {{{",".join(clause_to_json)}}}
11500                            )::JSON AS json_output
11501                        FROM
11502                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11503                        WHERE "{transcripts_column_id}" IS NOT NULL
11504                        )
11505                    GROUP BY "#CHROM", POS, REF, ALT
11506                ) AS t
11507                WHERE {table_variants}."#CHROM" = t."#CHROM"
11508                    AND {table_variants}."POS" = t."POS"
11509                    AND {table_variants}."REF" = t."REF"
11510                    AND {table_variants}."ALT" = t."ALT"
11511            """
11512
11513            self.execute_query(query=query_update)
11514
11515        # Transcripts to info column in FORMAT
11516        if transcripts_info_format:
11517
11518            # Create column on variants table
11519            self.add_column(
11520                table_name=table_variants,
11521                column_name=transcripts_info_format,
11522                column_type="VARCHAR",
11523                default_value=None,
11524                drop=False,
11525            )
11526
11527            # Add header
11528            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11529                transcripts_info_format,
11530                ".",
11531                "String",
11532                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11533                "unknwon",
11534                "unknwon",
11535                self.code_type_map["String"],
11536            )
11537
11538            # Add to update
11539            update_set_format.append(
11540                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11541            )
11542
11543        else:
11544
11545            # Set variable for internal queries
11546            transcripts_info_format = "transcripts_info_format"
11547
11548        # Transcripts to info field in JSON
11549        if transcripts_info_field_format:
11550
11551            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11552
11553            # Add to update
11554            update_set_format.append(
11555                f""" 
11556                    INFO = concat(
11557                            CASE
11558                                WHEN INFO NOT IN ('', '.')
11559                                THEN INFO
11560                                ELSE ''
11561                            END,
11562                            CASE
11563                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11564                                THEN concat(
11565                                    ';{transcripts_info_field_format}=',
11566                                    t.{transcripts_info_format}
11567                                )
11568                                ELSE ''
11569                            END
11570                            )
11571                """
11572            )
11573
11574            # Add header
11575            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11576                transcripts_info_field_format,
11577                ".",
11578                "String",
11579                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11580                "unknwon",
11581                "unknwon",
11582                self.code_type_map["String"],
11583            )
11584
11585        if update_set_format:
11586
11587            # Update query
11588            query_update = f"""
11589                UPDATE {table_variants}
11590                    SET {", ".join(update_set_format)}
11591                FROM
11592                (
11593                    SELECT
11594                        "#CHROM", POS, REF, ALT,
11595                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11596                    FROM 
11597                        (
11598                        SELECT
11599                            "#CHROM", POS, REF, ALT,
11600                            "{transcripts_column_id}",
11601                            concat(
11602                                "{transcripts_column_id}",
11603                                '|',
11604                                {", '|', ".join(clause_to_format)}
11605                            ) AS {transcripts_info_format}
11606                        FROM
11607                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11608                        )
11609                    GROUP BY "#CHROM", POS, REF, ALT
11610                ) AS t
11611                WHERE {table_variants}."#CHROM" = t."#CHROM"
11612                    AND {table_variants}."POS" = t."POS"
11613                    AND {table_variants}."REF" = t."REF"
11614                    AND {table_variants}."ALT" = t."ALT"
11615            """
11616
11617            self.execute_query(query=query_update)
11618
11619        return True
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = {},
        param: dict = {},
        load: bool = False,
    ) -> None:
        """
        Initialize the object: set input, output, config, param, the database
        connection, the VCF header and samples, and optionally load the data.

        :param conn: the connection to the database
        :param input: the input file
        :param output: the output file
        :param config: a dictionary containing the configuration of the model
        :param param: a dictionary containing the parameters of the model
        :param load: when True, load the input data right after initialization
        """

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # Database connection
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()

The function __init__ initializes the variables, sets the input, output, config, param, connection and header

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_samples(self, samples: list = None) -> list:
 86    def set_samples(self, samples: list = None) -> list:
 87        """
 88        The function `set_samples` sets the samples attribute of an object to a provided list or
 89        retrieves it from a parameter dictionary.
 90
 91        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
 92        input and sets the `samples` attribute of the class to the provided list. If no samples are
 93        provided, it tries to get the samples from the class's parameters using the `get_param` method
 94        :type samples: list
 95        :return: The `samples` list is being returned.
 96        """
 97
 98        if not samples:
 99            samples = self.get_param().get("samples", {}).get("list", None)
100
101        self.samples = samples
102
103        return samples

The function set_samples sets the samples attribute of an object to a provided list or retrieves it from a parameter dictionary.

Parameters
  • samples: The set_samples method is a method of a class that takes a list of samples as input and sets the samples attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the get_param method
Returns

The samples list is being returned.

def get_samples(self) -> list:
105    def get_samples(self) -> list:
106        """
107        This function returns a list of samples.
108        :return: The `get_samples` method is returning the `samples` attribute of the object.
109        """
110
111        return self.samples

This function returns a list of samples.

Returns

The get_samples method is returning the samples attribute of the object.

def get_samples_check(self) -> bool:
    def get_samples_check(self) -> bool:
        """
        Return whether sample checking is enabled.

        Reads the "check" key of the "samples" section of the parameters
        returned by `get_param()`.

        :return: the value of param["samples"]["check"]; defaults to `True`
        when the "check" key (or the whole "samples" section) is absent.
        """

        return self.get_param().get("samples", {}).get("check", True)

This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.

Returns

The method get_samples_check is returning the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the get_param() method. If the key "check" is not found, it will return True.

def set_input(self, input: str = None) -> None:
124    def set_input(self, input: str = None) -> None:
125        """
126        The function `set_input` takes a file name as input, extracts the name and extension, and sets
127        attributes in the class accordingly.
128
129        :param input: The `set_input` method in the provided code snippet is used to set attributes
130        related to the input file. Here's a breakdown of the parameters and their usage in the method:
131        :type input: str
132        """
133
134        if input and not isinstance(input, str):
135            try:
136                self.input = input.name
137            except:
138                log.error(f"Input file '{input} in bad format")
139                raise ValueError(f"Input file '{input} in bad format")
140        else:
141            self.input = input
142
143        # Input format
144        if input:
145            input_name, input_extension = os.path.splitext(self.input)
146            self.input_name = input_name
147            self.input_extension = input_extension
148            self.input_format = self.input_extension.replace(".", "")

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: the input file, given either as a path string or as a file-like object exposing a name attribute; when provided, it is also used to derive the input name, extension and format attributes.
def set_config(self, config: dict) -> None:
150    def set_config(self, config: dict) -> None:
151        """
152        The set_config function takes a config object and assigns it as the configuration object for the
153        class.
154
155        :param config: The `config` parameter in the `set_config` function is a dictionary object that
156        contains configuration settings for the class. When you call the `set_config` function with a
157        dictionary object as the argument, it will set that dictionary as the configuration object for
158        the class
159        :type config: dict
160        """
161
162        self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: The config parameter in the set_config function is a dictionary object that contains configuration settings for the class. When you call the set_config function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
def set_param(self, param: dict) -> None:
164    def set_param(self, param: dict) -> None:
165        """
166        This function sets a parameter object for the class based on the input dictionary.
167
168        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
169        as the `param` attribute of the class instance
170        :type param: dict
171        """
172
173        self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: The set_param method you provided takes a dictionary object as input and sets it as the param attribute of the class instance
def init_variables(self) -> None:
    def init_variables(self) -> None:
        """
        Initialize the default attributes used throughout the class:
        prefix, variants table name, dataframe placeholder, operator and
        type mapping tables, and the list of additional index fields.
        """

        self.prefix = "howard"
        self.table_variants = "variants"
        self.dataframe = None

        # Filter-operator keywords -> SQL comparison operators
        self.comparison_map = {
            "gt": ">",
            "gte": ">=",
            "lt": "<",
            "lte": "<=",
            "equals": "=",
            "contains": "SIMILAR TO",
        }

        # VCF header field type -> internal numeric type code
        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

        # VCF header field type -> SQL column type (Flag stored as VARCHAR)
        self.code_type_map_to_sql = {
            "Integer": "INTEGER",
            "String": "VARCHAR",
            "Float": "FLOAT",
            "Flag": "VARCHAR",
        }

        # Extra fields to index in addition to the defaults
        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
204    def get_indexing(self) -> bool:
205        """
206        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
207        returns False.
208        :return: The value of the indexing parameter.
209        """
210
211        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
213    def get_connexion_config(self) -> dict:
214        """
215        The function `get_connexion_config` returns a dictionary containing the configuration for a
216        connection, including the number of threads and memory limit.
217        :return: a dictionary containing the configuration for the Connexion library.
218        """
219
220        # config
221        config = self.get_config()
222
223        # Connexion config
224        connexion_config = {}
225        threads = self.get_threads()
226
227        # Threads
228        if threads:
229            connexion_config["threads"] = threads
230
231        # Memory
232        # if config.get("memory", None):
233        #     connexion_config["memory_limit"] = config.get("memory")
234        if self.get_memory():
235            connexion_config["memory_limit"] = self.get_memory()
236
237        # Temporary directory
238        if config.get("tmp", None):
239            connexion_config["temp_directory"] = config.get("tmp")
240
241        # Access
242        if config.get("access", None):
243            access = config.get("access")
244            if access in ["RO"]:
245                access = "READ_ONLY"
246            elif access in ["RW"]:
247                access = "READ_WRITE"
248            connexion_db = self.get_connexion_db()
249            if connexion_db in ":memory:":
250                access = "READ_WRITE"
251            connexion_config["access_mode"] = access
252
253        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the configuration for the Connexion library.

def get_duckdb_settings(self) -> dict:
255    def get_duckdb_settings(self) -> dict:
256        """
257        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
258        string.
259        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
260        """
261
262        # config
263        config = self.get_config()
264
265        # duckdb settings
266        duckdb_settings_dict = {}
267        if config.get("duckdb_settings", None):
268            duckdb_settings = config.get("duckdb_settings")
269            duckdb_settings = full_path(duckdb_settings)
270            # duckdb setting is a file
271            if os.path.exists(duckdb_settings):
272                with open(duckdb_settings) as json_file:
273                    duckdb_settings_dict = yaml.safe_load(json_file)
274            # duckdb settings is a string
275            else:
276                duckdb_settings_dict = json.loads(duckdb_settings)
277
278        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
280    def set_connexion_db(self) -> str:
281        """
282        The function `set_connexion_db` returns the appropriate database connection string based on the
283        input format and connection type.
284        :return: the value of the variable `connexion_db`.
285        """
286
287        # Default connexion db
288        default_connexion_db = ":memory:"
289
290        # Find connexion db
291        if self.get_input_format() in ["db", "duckdb"]:
292            connexion_db = self.get_input()
293        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
294            connexion_db = default_connexion_db
295        elif self.get_connexion_type() in ["tmpfile"]:
296            tmp_name = tempfile.mkdtemp(
297                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
298            )
299            connexion_db = f"{tmp_name}/tmp.db"
300        elif self.get_connexion_type() != "":
301            connexion_db = self.get_connexion_type()
302        else:
303            connexion_db = default_connexion_db
304
305        # Set connexion db
306        self.connexion_db = connexion_db
307
308        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
    def set_connexion(self, conn) -> None:
        """
        Create (or adopt) the database connexion.

        When no connexion is provided, a new one is opened according to the
        configured connexion format: "duckdb" (default) or "sqlite". For a
        duckdb connexion, any configured DuckDB settings are applied as
        PRAGMA statements.

        :param conn: an existing database connection to reuse; if None, a new
            connection is opened on the connexion db determined by
            `set_connexion_db()`
        """

        # Connexion db (file path or ":memory:")
        connexion_db = self.set_connexion_db()

        # Connexion config (threads, memory limit, tmp dir, access mode)
        connexion_config = self.get_connexion_config()

        # Connexion format ("duckdb" by default, or "sqlite")
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # duckDB settings applied one by one as PRAGMA statements
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            # Quote string values for the PRAGMA statement
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                # NOTE(review): connexion_config is not applied to sqlite
                # connections — confirm this is intended
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: The conn parameter in the set_connexion method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
def set_output(self, output: str = None) -> None:
356    def set_output(self, output: str = None) -> None:
357        """
358        The `set_output` function in Python sets the output file based on the input or a specified key
359        in the config file, extracting the output name, extension, and format.
360
361        :param output: The `output` parameter in the `set_output` method is used to specify the name of
362        the output file. If the config file has an 'output' key, the method sets the output to the value
363        of that key. If no output is provided, it sets the output to `None`
364        :type output: str
365        """
366
367        if output and not isinstance(output, str):
368            self.output = output.name
369        else:
370            self.output = output
371
372        # Output format
373        if self.output:
374            output_name, output_extension = os.path.splitext(self.output)
375            self.output_name = output_name
376            self.output_extension = output_extension
377            self.output_format = self.output_extension.replace(".", "")
378        else:
379            self.output_name = None
380            self.output_extension = None
381            self.output_format = None

The set_output function in Python sets the output file based on the input or a specified key in the config file, extracting the output name, extension, and format.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to None
def set_header(self) -> None:
    def set_header(self) -> None:
        """
        Read (or reconstruct) the VCF header of the input file and store it
        both as a list of lines (`self.header_list`) and as a vcf.Reader
        object (`self.header_vcf`).

        Header lookup order: an explicit "header_file" in the config; the
        header embedded in a (possibly bgzip-compressed) VCF/HDR input; a
        sidecar "<input>.hdr" file; otherwise a header reconstructed from the
        file's columns via the Database helper. Falls back to a minimal
        default VCF 4.2 header when nothing can be read. When no input file
        is set, both attributes are set to None.
        """

        input_file = self.get_input()
        # Minimal fallback header: VCF 4.2 with the standard columns only
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within the input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except — any failure (not only a
                    # missing header) silently falls back to the default
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # unknown/unsupported input format

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header available
            self.header_list = None
            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        Execute a SQL query and return the result as a pandas DataFrame.

        Supports both connexion formats: "duckdb" and "sqlite". When a
        limit is given, only the first batch/chunk of up to `limit` rows
        is fetched.

        :param query: the SQL query to execute
        :type query: str
        :param limit: maximum number of rows to fetch; when None, the full
            query result is returned
        :type limit: int
        :return: a pandas DataFrame with the query result
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Limit in query
        if limit:
            # NOTE(review): mutates the global pandas display option as a
            # side effect — confirm this is intended
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                # Fetch a single Arrow record batch of at most `limit` rows
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                # First chunk of the chunked result iterator
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: The limit parameter in the get_query_to_df function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the full query result is fetched.
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
    def get_overview(self) -> None:
        """
        Log an overview of the object: input/output files and formats,
        configuration, parameters, sample list and a preview of the
        variants table as a DataFrame.

        :return: None
        """
        table_variants_from = self.get_table_variants(clause="from")
        sql_columns = self.get_header_columns_as_sql()
        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
        df = self.get_query_to_df(sql_query_export)
        log.info(
            "Input:  "
            + str(self.get_input())
            + " ["
            + str(str(self.get_input_format()))
            + "]"
        )
        log.info(
            "Output: "
            + str(self.get_output())
            + " ["
            + str(str(self.get_output_format()))
            + "]"
        )
        # Config and param are logged line by line as indented JSON
        log.info("Config: ")
        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Param: ")
        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Sample list: " + str(self.get_header_sample_list()))
        log.info("Dataframe: ")
        for d in str(df).split("\n"):
            log.info("\t" + str(d))

        # Free the preview DataFrame eagerly (it can be large)
        del df
        gc.collect()

        return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
    def get_stats(self) -> dict:
        """
        Compute statistics of the loaded variants and return them as a dict.

        The returned dictionary contains the sections:
        - "Infos": input file, number of variants, number of samples (when
          computable) and number of INFO/FORMAT fields
        - "Variants": counts by chromosome, by type (SNV/MNV/InDel/Total)
          and SNV substitutions
        - "Samples": genotype counts per sample (only when a "GT" FORMAT
          field and a "FORMAT" column are present)
        - "Header": description of INFO and FORMAT header fields
        - "Quality": QUAL statistics (only when a "QUAL" column is present)

        :return: a dictionary of statistics
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT field definitions
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome, sorted by chromosome name
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage (fraction of the total per chromosome)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful when a GT FORMAT field exists
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count variants per genotype for this sample; only rows whose
                # sample value starts with a genotype pattern and whose FORMAT
                # and sample fields have the same number of ':'-separated
                # components are counted
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` is a global row counter across both INFO and FORMAT sections
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: special VCF "Number" codes (None=., -1=A, -2=G, -3=R)
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL: basic statistics over non-missing QUAL values
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel counts by type
        # NOTE(review): in the InDel branch, AND binds tighter than OR, so the
        # condition is `len(REF) > 1 OR (len(ALT) > 1 AND len(REF) != len(ALT))`
        # — confirm this matches the intended InDel definition

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitution spectrum (REF>ALT pairs), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object. The dictionary has the following structure:

def stats_to_file(self, file: str = None) -> str:
792    def stats_to_file(self, file: str = None) -> str:
793        """
794        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
795        into a JSON object, and writes the JSON object to the specified file.
796
797        :param file: The `file` parameter is a string that represents the file path where the JSON data
798        will be written
799        :type file: str
800        :return: the name of the file that was written to.
801        """
802
803        # Get stats
804        stats = self.get_stats()
805
806        # Serializing json
807        json_object = json.dumps(stats, indent=4)
808
809        # Writing to sample.json
810        with open(file, "w") as outfile:
811            outfile.write(json_object)
812
813        return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
815    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
816        """
817        The `print_stats` function generates a markdown file and prints the statistics contained in a
818        JSON file in a formatted manner.
819
820        :param output_file: The `output_file` parameter is a string that specifies the path and filename
821        of the output file where the stats will be printed in Markdown format. If no `output_file` is
822        provided, a temporary directory will be created and the stats will be saved in a file named
823        "stats.md" within that
824        :type output_file: str
825        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
826        file where the statistics will be saved. If no value is provided, a temporary directory will be
827        created and a default file name "stats.json" will be used
828        :type json_file: str
829        :return: The function `print_stats` does not return any value. It has a return type annotation
830        of `None`.
831        """
832
833        # Full path
834        output_file = full_path(output_file)
835        json_file = full_path(json_file)
836
837        with tempfile.TemporaryDirectory() as tmpdir:
838
839            # Files
840            if not output_file:
841                output_file = os.path.join(tmpdir, "stats.md")
842            if not json_file:
843                json_file = os.path.join(tmpdir, "stats.json")
844
845            # Create folders
846            if not os.path.exists(os.path.dirname(output_file)):
847                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
848            if not os.path.exists(os.path.dirname(json_file)):
849                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
850
851            # Create stats JSON file
852            stats_file = self.stats_to_file(file=json_file)
853
854            # Print stats file
855            with open(stats_file) as f:
856                stats = yaml.safe_load(f)
857
858            # Output
859            output_title = []
860            output_index = []
861            output = []
862
863            # Title
864            output_title.append("# HOWARD Stats")
865
866            # Index
867            output_index.append("## Index")
868
869            # Process sections
870            for section in stats:
871                infos = stats.get(section)
872                section_link = "#" + section.lower().replace(" ", "-")
873                output.append(f"## {section}")
874                output_index.append(f"- [{section}]({section_link})")
875
876                if len(infos):
877                    for info in infos:
878                        try:
879                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
880                            is_df = True
881                        except:
882                            try:
883                                df = pd.DataFrame.from_dict(
884                                    json.loads((infos.get(info))), orient="index"
885                                )
886                                is_df = True
887                            except:
888                                is_df = False
889                        if is_df:
890                            output.append(f"### {info}")
891                            info_link = "#" + info.lower().replace(" ", "-")
892                            output_index.append(f"   - [{info}]({info_link})")
893                            output.append(f"{df.to_markdown(index=False)}")
894                        else:
895                            output.append(f"- {info}: {infos.get(info)}")
896                else:
897                    output.append(f"NA")
898
899            # Write stats in markdown file
900            with open(output_file, "w") as fp:
901                for item in output_title:
902                    fp.write("%s\n" % item)
903                for item in output_index:
904                    fp.write("%s\n" % item)
905                for item in output:
906                    fp.write("%s\n" % item)
907
908            # Output stats in markdown
909            print("")
910            print("\n\n".join(output_title))
911            print("")
912            print("\n\n".join(output))
913            print("")
914
915        return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
917    def get_input(self) -> str:
918        """
919        It returns the value of the input variable.
920        :return: The input is being returned.
921        """
922        return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
924    def get_input_format(self, input_file: str = None) -> str:
925        """
926        This function returns the format of the input variable, either from the provided input file or
927        by prompting for input.
928
929        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
930        represents the file path of the input file. If no `input_file` is provided when calling the
931        method, it will default to `None`
932        :type input_file: str
933        :return: The format of the input variable is being returned.
934        """
935
936        if not input_file:
937            input_file = self.get_input()
938        input_format = get_file_format(input_file)
939        return input_format

This function returns the format of the input variable, either from the provided input file or by prompting for input.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
941    def get_input_compressed(self, input_file: str = None) -> str:
942        """
943        The function `get_input_compressed` returns the format of the input variable after compressing
944        it.
945
946        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
947        that represents the file path of the input file. If no `input_file` is provided when calling the
948        method, it will default to `None` and the method will then call `self.get_input()` to
949        :type input_file: str
950        :return: The function `get_input_compressed` returns the compressed format of the input
951        variable.
952        """
953
954        if not input_file:
955            input_file = self.get_input()
956        input_compressed = get_file_compressed(input_file)
957        return input_compressed

The function get_input_compressed returns the format of the input variable after compressing it.

Parameters
  • input_file: The input_file parameter in the get_input_compressed method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None and the method will then call self.get_input() to
Returns

The function get_input_compressed returns the compressed format of the input variable.

def get_output(self) -> str:
959    def get_output(self) -> str:
960        """
961        It returns the output of the neuron.
962        :return: The output of the neural network.
963        """
964
965        return self.output

It returns the output file path.

Returns

The output file path.

def get_output_format(self, output_file: str = None) -> str:
967    def get_output_format(self, output_file: str = None) -> str:
968        """
969        The function `get_output_format` returns the format of the input variable or the output file if
970        provided.
971
972        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
973        that represents the file path of the output file. If no `output_file` is provided when calling
974        the method, it will default to the output obtained from the `get_output` method of the class
975        instance. The
976        :type output_file: str
977        :return: The format of the input variable is being returned.
978        """
979
980        if not output_file:
981            output_file = self.get_output()
982        output_format = get_file_format(output_file)
983
984        return output_format

The function get_output_format returns the format of the input variable or the output file if provided.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance. The
Returns

The format of the input variable is being returned.

def get_config(self) -> dict:
986    def get_config(self) -> dict:
987        """
988        It returns the config
989        :return: The config variable is being returned.
990        """
991        return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
993    def get_param(self) -> dict:
994        """
995        It returns the param
996        :return: The param variable is being returned.
997        """
998        return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
1000    def get_connexion_db(self) -> str:
1001        """
1002        It returns the connexion_db attribute of the object
1003        :return: The connexion_db is being returned.
1004        """
1005        return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
1007    def get_prefix(self) -> str:
1008        """
1009        It returns the prefix of the object.
1010        :return: The prefix is being returned.
1011        """
1012        return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = 'select') -> str:
1014    def get_table_variants(self, clause: str = "select") -> str:
1015        """
1016        This function returns the table_variants attribute of the object
1017
1018        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
1019        defaults to select (optional)
1020        :return: The table_variants attribute of the object.
1021        """
1022
1023        # Access
1024        access = self.get_config().get("access", None)
1025
1026        # Clauses "select", "where", "update"
1027        if clause in ["select", "where", "update"]:
1028            table_variants = self.table_variants
1029        # Clause "from"
1030        elif clause in ["from"]:
1031            # For Read Only
1032            if self.get_input_format() in ["parquet"] and access in ["RO"]:
1033                input_file = self.get_input()
1034                table_variants = f"'{input_file}' as variants"
1035            # For Read Write
1036            else:
1037                table_variants = f"{self.table_variants} as variants"
1038        else:
1039            table_variants = self.table_variants
1040        return table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
1042    def get_tmp_dir(self) -> str:
1043        """
1044        The function `get_tmp_dir` returns the temporary directory path based on configuration
1045        parameters or a default path.
1046        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1047        configuration, parameters, and a default value of "/tmp".
1048        """
1049
1050        return get_tmp(
1051            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1052        )

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
1054    def get_connexion_type(self) -> str:
1055        """
1056        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1057
1058        :return: The connexion type is being returned.
1059        """
1060        return self.get_config().get("connexion_type", "memory")

It returns the connexion type from the configuration, defaulting to "memory" when unset

Returns

The connexion type is being returned.

def get_connexion(self):
1062    def get_connexion(self):
1063        """
1064        It returns the connection object
1065
1066        :return: The connection object.
1067        """
1068        return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
1070    def close_connexion(self) -> None:
1071        """
1072        This function closes the connection to the database.
1073        :return: The connection is being closed.
1074        """
1075        return self.conn.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = 'vcf'):
1077    def get_header(self, type: str = "vcf"):
1078        """
1079        This function returns the header of the VCF file as a list of strings
1080
1081        :param type: the type of header you want to get, defaults to vcf (optional)
1082        :return: The header of the vcf file.
1083        """
1084
1085        if self.header_vcf:
1086            if type == "vcf":
1087                return self.header_vcf
1088            elif type == "list":
1089                return self.header_list
1090        else:
1091            if type == "vcf":
1092                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1093                return header
1094            elif type == "list":
1095                return vcf_required

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_infos_list(self) -> list:
1097    def get_header_infos_list(self) -> list:
1098        """
1099        This function retrieves a list of information fields from the header.
1100        :return: A list of information fields from the header.
1101        """
1102
1103        # Init
1104        infos_list = []
1105
1106        for field in self.get_header().infos:
1107            infos_list.append(field)
1108
1109        return infos_list

This function retrieves a list of information fields from the header.

Returns

A list of information fields from the header.

def get_header_length(self, file: str = None) -> int:
1111    def get_header_length(self, file: str = None) -> int:
1112        """
1113        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1114        line.
1115
1116        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1117        header file. If this argument is provided, the function will read the header from the specified
1118        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1119        :type file: str
1120        :return: the length of the header list, excluding the #CHROM line.
1121        """
1122
1123        if file:
1124            return len(self.read_vcf_header_file(file=file)) - 1
1125        elif self.get_header(type="list"):
1126            return len(self.get_header(type="list")) - 1
1127        else:
1128            return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
1130    def get_header_columns(self) -> str:
1131        """
1132        This function returns the header list of a VCF
1133
1134        :return: The length of the header list.
1135        """
1136        if self.get_header():
1137            return self.get_header(type="list")[-1]
1138        else:
1139            return ""

This function returns the header list of a VCF

Returns

The length of the header list.

def get_header_columns_as_list(self) -> list:
1141    def get_header_columns_as_list(self) -> list:
1142        """
1143        This function returns the header list of a VCF
1144
1145        :return: The length of the header list.
1146        """
1147        if self.get_header():
1148            return self.get_header_columns().strip().split("\t")
1149        else:
1150            return []

This function returns the header list of a VCF

Returns

The length of the header list.

def get_header_columns_as_sql(self) -> str:
1152    def get_header_columns_as_sql(self) -> str:
1153        """
1154        This function retruns header length (without #CHROM line)
1155
1156        :return: The length of the header list.
1157        """
1158        sql_column_list = []
1159        for col in self.get_header_columns_as_list():
1160            sql_column_list.append(f'"{col}"')
1161        return ",".join(sql_column_list)

This function retruns header length (without #CHROM line)

Returns

The length of the header list.

def get_header_sample_list( self, check: bool = False, samples: list = None, samples_force: bool = False) -> list:
1163    def get_header_sample_list(
1164        self, check: bool = False, samples: list = None, samples_force: bool = False
1165    ) -> list:
1166        """
1167        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
1168        checking and filtering based on input parameters.
1169
1170        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
1171        parameter that determines whether to check if the samples in the list are properly defined as
1172        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
1173        list is defined as a, defaults to False
1174        :type check: bool (optional)
1175        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
1176        allows you to specify a subset of samples from the header. If you provide a list of sample
1177        names, the function will check if each sample is defined in the header. If a sample is not found
1178        in the
1179        :type samples: list
1180        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
1181        a boolean parameter that determines whether to force the function to return the sample list
1182        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
1183        function will return the sample list without performing, defaults to False
1184        :type samples_force: bool (optional)
1185        :return: The function `get_header_sample_list` returns a list of samples based on the input
1186        parameters and conditions specified in the function.
1187        """
1188
1189        # Init
1190        samples_list = []
1191
1192        if samples is None:
1193            samples_list = self.header_vcf.samples
1194        else:
1195            samples_checked = []
1196            for sample in samples:
1197                if sample in self.header_vcf.samples:
1198                    samples_checked.append(sample)
1199                else:
1200                    log.warning(f"Sample '{sample}' not defined in header")
1201            samples_list = samples_checked
1202
1203            # Force sample list without checking if is_genotype_column
1204            if samples_force:
1205                log.warning(f"Samples {samples_list} not checked if genotypes")
1206                return samples_list
1207
1208        if check:
1209            samples_checked = []
1210            for sample in samples_list:
1211                if self.is_genotype_column(column=sample):
1212                    samples_checked.append(sample)
1213                else:
1214                    log.warning(
1215                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
1216                    )
1217            samples_list = samples_checked
1218
1219        # Return samples list
1220        return samples_list

The function get_header_sample_list returns a list of samples from a VCF header, with optional checking and filtering based on input parameters.

Parameters
  • check: The check parameter in the get_header_sample_list function is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. If check is set to True, the function will verify if each sample in the list is defined as a, defaults to False
  • samples: The samples parameter in the get_header_sample_list function is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header. If a sample is not found in the header, it is dropped with a warning.
  • samples_force: The samples_force parameter in the get_header_sample_list function is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. If samples_force is set to True, the function will return the sample list without performing, defaults to False
Returns

The function get_header_sample_list returns a list of samples based on the input parameters and conditions specified in the function.

def is_genotype_column(self, column: str = None) -> bool:
1222    def is_genotype_column(self, column: str = None) -> bool:
1223        """
1224        This function checks if a given column is a genotype column in a database.
1225
1226        :param column: The `column` parameter in the `is_genotype_column` method is a string that
1227        represents the column name in a database table. This method checks if the specified column is a
1228        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
1229        method of
1230        :type column: str
1231        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
1232        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
1233        column name and returns the result. If the `column` parameter is None, it returns False.
1234        """
1235
1236        if column is not None:
1237            return Database(database=self.get_input()).is_genotype_column(column=column)
1238        else:
1239            return False

This function checks if a given column is a genotype column in a database.

Parameters
  • column: The column parameter in the is_genotype_column method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database. If a column name is provided, it calls the is_genotype_column method of
Returns

The is_genotype_column method is returning a boolean value. If the column parameter is not None, it calls the is_genotype_column method of the Database class with the specified column name and returns the result. If the column parameter is None, it returns False.

def get_verbose(self) -> bool:
1241    def get_verbose(self) -> bool:
1242        """
1243        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1244        exist
1245
1246        :return: The value of the key "verbose" in the config dictionary.
1247        """
1248        return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
1250    def get_connexion_format(self) -> str:
1251        """
1252        It returns the connexion format of the object.
1253        :return: The connexion_format is being returned.
1254        """
1255        connexion_format = self.connexion_format
1256        if connexion_format not in ["duckdb", "sqlite"]:
1257            log.error(f"Unknown connexion format {connexion_format}")
1258            raise ValueError(f"Unknown connexion format {connexion_format}")
1259        else:
1260            return connexion_format

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table( self, file, columns: str, header_len: int = 0, sep: str = '\t', chunksize: int = 1000000) -> None:
1262    def insert_file_to_table(
1263        self,
1264        file,
1265        columns: str,
1266        header_len: int = 0,
1267        sep: str = "\t",
1268        chunksize: int = 1000000,
1269    ) -> None:
1270        """
1271        The function reads a file in chunks and inserts each chunk into a table based on the specified
1272        database format.
1273
1274        :param file: The `file` parameter is the file that you want to load into a table. It should be
1275        the path to the file on your system
1276        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
1277        should contain the names of the columns in the table where the data will be inserted. The column
1278        names should be separated by commas within the string. For example, if you have columns named
1279        "id", "name
1280        :type columns: str
1281        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
1282        the number of lines to skip at the beginning of the file before reading the actual data. This
1283        parameter allows you to skip any header information present in the file before processing the
1284        data, defaults to 0
1285        :type header_len: int (optional)
1286        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
1287        separator character that is used in the file being read. In this case, the default separator is
1288        set to `\t`, which represents a tab character. You can change this parameter to a different
1289        separator character if, defaults to \t
1290        :type sep: str (optional)
1291        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
1292        when processing the file in chunks. In the provided code snippet, the default value for
1293        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
1294        to 1000000
1295        :type chunksize: int (optional)
1296        """
1297
1298        # Config
1299        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
1300        connexion_format = self.get_connexion_format()
1301
1302        log.debug("chunksize: " + str(chunksize))
1303
1304        if chunksize:
1305            for chunk in pd.read_csv(
1306                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
1307            ):
1308                if connexion_format in ["duckdb"]:
1309                    sql_insert_into = (
1310                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
1311                    )
1312                    self.conn.execute(sql_insert_into)
1313                elif connexion_format in ["sqlite"]:
1314                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function specifies the separator character used in the file being read. The default separator is the tab character; you can change it to a different separator character if needed.
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value for chunksize is set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
1316    def load_data(
1317        self,
1318        input_file: str = None,
1319        drop_variants_table: bool = False,
1320        sample_size: int = 20480,
1321    ) -> None:
1322        """
1323        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
1324        table before loading the data and specify a sample size.
1325
1326        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
1327        table
1328        :type input_file: str
1329        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
1330        determines whether the variants table should be dropped before loading the data. If set to
1331        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
1332        not be dropped, defaults to False
1333        :type drop_variants_table: bool (optional)
1334        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
1335        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
1336        20480
1337        :type sample_size: int (optional)
1338        """
1339
1340        log.info("Loading...")
1341
1342        # change input file
1343        if input_file:
1344            self.set_input(input_file)
1345            self.set_header()
1346
1347        # drop variants table
1348        if drop_variants_table:
1349            self.drop_variants_table()
1350
1351        # get table variants
1352        table_variants = self.get_table_variants()
1353
1354        # Access
1355        access = self.get_config().get("access", None)
1356        log.debug(f"access: {access}")
1357
1358        # Input format and compress
1359        input_format = self.get_input_format()
1360        input_compressed = self.get_input_compressed()
1361        log.debug(f"input_format: {input_format}")
1362        log.debug(f"input_compressed: {input_compressed}")
1363
1364        # input_compressed_format
1365        if input_compressed:
1366            input_compressed_format = "gzip"
1367        else:
1368            input_compressed_format = "none"
1369        log.debug(f"input_compressed_format: {input_compressed_format}")
1370
1371        # Connexion format
1372        connexion_format = self.get_connexion_format()
1373
1374        # Sample size
1375        if not sample_size:
1376            sample_size = -1
1377        log.debug(f"sample_size: {sample_size}")
1378
1379        # Load data
1380        log.debug(f"Load Data from {input_format}")
1381
1382        # DuckDB connexion
1383        if connexion_format in ["duckdb"]:
1384
1385            # Database already exists
1386            if self.input_format in ["db", "duckdb"]:
1387
1388                if connexion_format in ["duckdb"]:
1389                    log.debug(f"Input file format '{self.input_format}' duckDB")
1390                else:
1391                    log.error(
1392                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1393                    )
1394                    raise ValueError(
1395                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1396                    )
1397
1398            # Load from existing database format
1399            else:
1400
1401                try:
1402                    # Create Table or View
1403                    database = Database(database=self.input)
1404                    sql_from = database.get_sql_from(sample_size=sample_size)
1405
1406                    if access in ["RO"]:
1407                        sql_load = (
1408                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
1409                        )
1410                    else:
1411                        sql_load = (
1412                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
1413                        )
1414                    self.conn.execute(sql_load)
1415
1416                except:
1417                    # Format not available
1418                    log.error(f"Input file format '{self.input_format}' not available")
1419                    raise ValueError(
1420                        f"Input file format '{self.input_format}' not available"
1421                    )
1422
1423        # SQLite connexion
1424        elif connexion_format in ["sqlite"] and input_format in [
1425            "vcf",
1426            "tsv",
1427            "csv",
1428            "psv",
1429        ]:
1430
1431            # Main structure
1432            structure = {
1433                "#CHROM": "VARCHAR",
1434                "POS": "INTEGER",
1435                "ID": "VARCHAR",
1436                "REF": "VARCHAR",
1437                "ALT": "VARCHAR",
1438                "QUAL": "VARCHAR",
1439                "FILTER": "VARCHAR",
1440                "INFO": "VARCHAR",
1441            }
1442
1443            # Strcuture with samples
1444            structure_complete = structure
1445            if self.get_header_sample_list():
1446                structure["FORMAT"] = "VARCHAR"
1447                for sample in self.get_header_sample_list():
1448                    structure_complete[sample] = "VARCHAR"
1449
1450            # Columns list for create and insert
1451            sql_create_table_columns = []
1452            sql_create_table_columns_list = []
1453            for column in structure_complete:
1454                column_type = structure_complete[column]
1455                sql_create_table_columns.append(
1456                    f'"{column}" {column_type} default NULL'
1457                )
1458                sql_create_table_columns_list.append(f'"{column}"')
1459
1460            # Create database
1461            log.debug(f"Create Table {table_variants}")
1462            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
1463            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
1464            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
1465            self.conn.execute(sql_create_table)
1466
1467            # chunksize define length of file chunk load file
1468            chunksize = 100000
1469
1470            # delimiter
1471            delimiter = file_format_delimiters.get(input_format, "\t")
1472
1473            # Load the input file
1474            with open(self.input, "rt") as input_file:
1475
1476                # Use the appropriate file handler based on the input format
1477                if input_compressed:
1478                    input_file = bgzf.open(self.input, "rt")
1479                if input_format in ["vcf"]:
1480                    header_len = self.get_header_length()
1481                else:
1482                    header_len = 0
1483
1484                # Insert the file contents into a table
1485                self.insert_file_to_table(
1486                    input_file,
1487                    columns=sql_create_table_columns_list_sql,
1488                    header_len=header_len,
1489                    sep=delimiter,
1490                    chunksize=chunksize,
1491                )
1492
1493        else:
1494            log.error(
1495                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1496            )
1497            raise ValueError(
1498                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1499            )
1500
1501        # Explode INFOS fields into table fields
1502        if self.get_explode_infos():
1503            self.explode_infos(
1504                prefix=self.get_explode_infos_prefix(),
1505                fields=self.get_explode_infos_fields(),
1506                force=True,
1507            )
1508
1509        # Create index after insertion
1510        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows sampled from the input file to infer the schema. If it is set to None or 0, sampling is disabled (all rows are read), defaults to 20480
def get_explode_infos(self) -> bool:
1512    def get_explode_infos(self) -> bool:
1513        """
1514        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1515        to False if it is not set.
1516        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1517        value. If the parameter is not present, it will return False.
1518        """
1519
1520        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
1522    def get_explode_infos_fields(
1523        self,
1524        explode_infos_fields: str = None,
1525        remove_fields_not_in_header: bool = False,
1526    ) -> list:
1527        """
1528        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1529        the input parameter `explode_infos_fields`.
1530
1531        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1532        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1533        comma-separated list of field names to explode
1534        :type explode_infos_fields: str
1535        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1536        flag that determines whether to remove fields that are not present in the header. If it is set
1537        to `True`, any field that is not in the header will be excluded from the list of exploded
1538        information fields. If it is set to `, defaults to False
1539        :type remove_fields_not_in_header: bool (optional)
1540        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1541        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1542        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1543        Otherwise, it returns a list of exploded information fields after removing any spaces and
1544        splitting the string by commas.
1545        """
1546
1547        # If no fields, get it in param
1548        if not explode_infos_fields:
1549            explode_infos_fields = (
1550                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1551            )
1552
1553        # If no fields, defined as all fields in header using keyword
1554        if not explode_infos_fields:
1555            explode_infos_fields = "*"
1556
1557        # If fields list not empty
1558        if explode_infos_fields:
1559
1560            # Input fields list
1561            if isinstance(explode_infos_fields, str):
1562                fields_input = explode_infos_fields.split(",")
1563            elif isinstance(explode_infos_fields, list):
1564                fields_input = explode_infos_fields
1565            else:
1566                fields_input = []
1567
1568            # Fields list without * keyword
1569            fields_without_all = fields_input.copy()
1570            if "*".casefold() in (item.casefold() for item in fields_without_all):
1571                fields_without_all.remove("*")
1572
1573            # Fields in header
1574            fields_in_header = sorted(list(set(self.get_header().infos)))
1575
1576            # Construct list of fields
1577            fields_output = []
1578            for field in fields_input:
1579
1580                # Strip field
1581                field = field.strip()
1582
1583                # format keyword * in regex
1584                if field.upper() in ["*"]:
1585                    field = ".*"
1586
1587                # Find all fields with pattern
1588                r = re.compile(field)
1589                fields_search = sorted(list(filter(r.match, fields_in_header)))
1590
1591                # Remove fields input from search
1592                if field in fields_search:
1593                    fields_search = [field]
1594                elif fields_search != [field]:
1595                    fields_search = sorted(
1596                        list(set(fields_search).difference(fields_input))
1597                    )
1598
1599                # If field is not in header (avoid not well formatted header)
1600                if not fields_search and not remove_fields_not_in_header:
1601                    fields_search = [field]
1602
1603                # Add found fields
1604                for new_field in fields_search:
1605                    # Add field, if not already exists, and if it is in header (if asked)
1606                    if (
1607                        new_field not in fields_output
1608                        and (
1609                            not remove_fields_not_in_header
1610                            or new_field in fields_in_header
1611                        )
1612                        and new_field not in [".*"]
1613                    ):
1614                        fields_output.append(new_field)
1615
1616            return fields_output
1617
1618        else:
1619
1620            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter specifies the fields to be exploded, either as a comma-separated string or a list. It can be set to "*" to explode all fields of the header, and entries may be regex patterns matched against the header fields
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header will be excluded from the list of exploded information fields, defaults to False
Returns

The function get_explode_infos_fields returns a list of INFO field names to explode. If the explode_infos_fields parameter is not provided, it is read from the parameters; when still empty, the "*" keyword is used, which expands to all fields of the header. Each requested field is stripped of surrounding spaces and may be a regex pattern matched against the header fields; with remove_fields_not_in_header set to True, fields absent from the header are filtered out of the result.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1622    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1623        """
1624        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1625        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1626        not provided.
1627
1628        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1629        prefix to be used for exploding or expanding information
1630        :type explode_infos_prefix: str
1631        :return: the value of the variable `explode_infos_prefix`.
1632        """
1633
1634        if not explode_infos_prefix:
1635            explode_infos_prefix = (
1636                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1637            )
1638
1639        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
1641    def add_column(
1642        self,
1643        table_name,
1644        column_name,
1645        column_type,
1646        default_value=None,
1647        drop: bool = False,
1648    ) -> dict:
1649        """
1650        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
1651        doesn't already exist.
1652
1653        :param table_name: The name of the table to which you want to add a column
1654        :param column_name: The parameter "column_name" is the name of the column that you want to add
1655        to the table
1656        :param column_type: The `column_type` parameter specifies the data type of the column that you
1657        want to add to the table. It should be a string that represents the desired data type, such as
1658        "INTEGER", "TEXT", "REAL", etc
1659        :param default_value: The `default_value` parameter is an optional parameter that specifies the
1660        default value for the newly added column. If a default value is provided, it will be assigned to
1661        the column for any existing rows that do not have a value for that column
1662        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
1663        if it already exists in the table. If `drop` is set to `True`, the function will drop the
1664        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
1665        to False
1666        :type drop: bool (optional)
1667        :return: a boolean value indicating whether the column was successfully added to the table.
1668        """
1669
1670        # added
1671        added = False
1672        dropped = False
1673
1674        # Check if the column already exists in the table
1675        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1676        columns = self.get_query_to_df(query).columns.tolist()
1677        if column_name.upper() in [c.upper() for c in columns]:
1678            log.debug(
1679                f"The {column_name} column already exists in the {table_name} table"
1680            )
1681            if drop:
1682                self.drop_column(table_name=table_name, column_name=column_name)
1683                dropped = True
1684            else:
1685                return None
1686        else:
1687            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1688
1689        # Add column in table
1690        add_column_query = (
1691            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
1692        )
1693        if default_value is not None:
1694            add_column_query += f" DEFAULT {default_value}"
1695        self.execute_query(add_column_query)
1696        added = not dropped
1697        log.debug(
1698            f"The {column_name} column was successfully added to the {table_name} table"
1699        )
1700
1701        if added:
1702            added_column = {
1703                "table_name": table_name,
1704                "column_name": column_name,
1705                "column_type": column_type,
1706                "default_value": default_value,
1707            }
1708        else:
1709            added_column = None
1710
1711        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column. If drop is set to False (default),, defaults to False
Returns

a dict describing the added column (keys "table_name", "column_name", "column_type", "default_value"), or None when the column already existed.

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1713    def drop_column(
1714        self, column: dict = None, table_name: str = None, column_name: str = None
1715    ) -> bool:
1716        """
1717        The `drop_column` function drops a specified column from a given table in a database and returns
1718        True if the column was successfully dropped, and False if the column does not exist in the
1719        table.
1720
1721        :param column: The `column` parameter is a dictionary that contains information about the column
1722        you want to drop. It has two keys:
1723        :type column: dict
1724        :param table_name: The `table_name` parameter is the name of the table from which you want to
1725        drop a column
1726        :type table_name: str
1727        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1728        from the table
1729        :type column_name: str
1730        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1731        and False if the column does not exist in the table.
1732        """
1733
1734        # Find column infos
1735        if column:
1736            if isinstance(column, dict):
1737                table_name = column.get("table_name", None)
1738                column_name = column.get("column_name", None)
1739            elif isinstance(column, str):
1740                table_name = self.get_table_variants()
1741                column_name = column
1742            else:
1743                table_name = None
1744                column_name = None
1745
1746        if not table_name and not column_name:
1747            return False
1748
1749        # Removed
1750        removed = False
1751
1752        # Check if the column already exists in the table
1753        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1754        columns = self.get_query_to_df(query).columns.tolist()
1755        if column_name in columns:
1756            log.debug(f"The {column_name} column exists in the {table_name} table")
1757        else:
1758            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1759            return False
1760
1761        # Add column in table # ALTER TABLE integers DROP k
1762        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1763        self.execute_query(add_column_query)
1764        removed = True
1765        log.debug(
1766            f"The {column_name} column was successfully dropped to the {table_name} table"
1767        )
1768
1769        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is either a dict with the keys table_name and column_name, or a column name as a string (the table then defaults to the variants table)
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False, table: str = None) -> list:
1771    def explode_infos(
1772        self,
1773        prefix: str = None,
1774        create_index: bool = False,
1775        fields: list = None,
1776        force: bool = False,
1777        proccess_all_fields_together: bool = False,
1778        table: str = None,
1779    ) -> list:
1780        """
1781        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
1782        individual columns, returning a list of added columns.
1783
1784        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
1785        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
1786        `self.get_explode_infos_prefix()` as the prefix
1787        :type prefix: str
1788        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
1789        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
1790        `False`, indexes will not be created. The default value is `False`, defaults to False
1791        :type create_index: bool (optional)
1792        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
1793        that you want to explode into individual columns. If this parameter is not provided, all INFO
1794        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
1795        a list to the `
1796        :type fields: list
1797        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
1798        determines whether to drop and recreate a column if it already exists in the table. If `force`
1799        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
1800        defaults to False
1801        :type force: bool (optional)
1802        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
1803        flag that determines whether to process all the INFO fields together or individually. If set to
1804        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
1805        be processed individually. The default value is, defaults to False
1806        :type proccess_all_fields_together: bool (optional)
1807        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
1808        of the table where the exploded INFO fields will be added as individual columns. If you provide
1809        a value for the `table` parameter, the function will use that table name. If the `table`
1810        parameter is
1811        :type table: str
1812        :return: The `explode_infos` function returns a list of added columns.
1813        """
1814
1815        # drop indexes
1816        self.drop_indexes()
1817
1818        # connexion format
1819        connexion_format = self.get_connexion_format()
1820
1821        # Access
1822        access = self.get_config().get("access", None)
1823
1824        # Added columns
1825        added_columns = []
1826
1827        if access not in ["RO"]:
1828
1829            # prefix
1830            if prefix in [None, True] or not isinstance(prefix, str):
1831                if self.get_explode_infos_prefix() not in [None, True]:
1832                    prefix = self.get_explode_infos_prefix()
1833                else:
1834                    prefix = "INFO/"
1835
1836            # table variants
1837            if table is not None:
1838                table_variants = table
1839            else:
1840                table_variants = self.get_table_variants(clause="select")
1841
1842            # extra infos
1843            try:
1844                extra_infos = self.get_extra_infos()
1845            except:
1846                extra_infos = []
1847
1848            # Header infos
1849            header_infos = self.get_header().infos
1850
1851            log.debug(
1852                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
1853            )
1854
1855            sql_info_alter_table_array = []
1856
1857            # Info fields to check
1858            fields_list = list(header_infos)
1859            if fields:
1860                fields_list += fields
1861            fields_list = set(fields_list)
1862
1863            # If no fields
1864            if not fields:
1865                fields = []
1866
1867            # Translate fields if patterns
1868            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
1869
1870            for info in fields:
1871
1872                info_id_sql = prefix + info
1873
1874                if (
1875                    info in fields_list
1876                    or prefix + info in fields_list
1877                    or info in extra_infos
1878                ):
1879
1880                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
1881
1882                    if info in header_infos:
1883                        info_type = header_infos[info].type
1884                        info_num = header_infos[info].num
1885                    else:
1886                        info_type = "String"
1887                        info_num = 0
1888
1889                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
1890                    if info_num != 1:
1891                        type_sql = "VARCHAR"
1892
1893                    # Add field
1894                    added_column = self.add_column(
1895                        table_name=table_variants,
1896                        column_name=info_id_sql,
1897                        column_type=type_sql,
1898                        default_value="null",
1899                        drop=force,
1900                    )
1901
1902                    if added_column:
1903                        added_columns.append(added_column)
1904
1905                    if added_column or force:
1906
1907                        # add field to index
1908                        self.index_additionnal_fields.append(info_id_sql)
1909
1910                        # Update field array
1911                        if connexion_format in ["duckdb"]:
1912                            update_info_field = f"""
1913                            "{info_id_sql}" =
1914                                CASE
1915                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
1916                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
1917                                END
1918                            """
1919                        elif connexion_format in ["sqlite"]:
1920                            update_info_field = f"""
1921                                "{info_id_sql}" =
1922                                    CASE
1923                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
1924                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
1925                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
1926                                    END
1927                            """
1928
1929                        sql_info_alter_table_array.append(update_info_field)
1930
1931            if sql_info_alter_table_array:
1932
1933                # By chromosomes
1934                try:
1935                    chromosomes_list = list(
1936                        self.get_query_to_df(
1937                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
1938                        )["#CHROM"]
1939                    )
1940                except:
1941                    chromosomes_list = [None]
1942
1943                for chrom in chromosomes_list:
1944                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
1945
1946                    # Where clause
1947                    where_clause = ""
1948                    if chrom and len(chromosomes_list) > 1:
1949                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
1950
1951                    # Update table
1952                    if proccess_all_fields_together:
1953                        sql_info_alter_table_array_join = ", ".join(
1954                            sql_info_alter_table_array
1955                        )
1956                        if sql_info_alter_table_array_join:
1957                            sql_info_alter_table = f"""
1958                                UPDATE {table_variants}
1959                                SET {sql_info_alter_table_array_join}
1960                                {where_clause}
1961                                """
1962                            log.debug(
1963                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
1964                            )
1965                            # log.debug(sql_info_alter_table)
1966                            self.conn.execute(sql_info_alter_table)
1967                    else:
1968                        sql_info_alter_num = 0
1969                        for sql_info_alter in sql_info_alter_table_array:
1970                            sql_info_alter_num += 1
1971                            sql_info_alter_table = f"""
1972                                UPDATE {table_variants}
1973                                SET {sql_info_alter}
1974                                {where_clause}
1975                                """
1976                            log.debug(
1977                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
1978                            )
1979                            # log.debug(sql_info_alter_table)
1980                            self.conn.execute(sql_info_alter_table)
1981
1982        # create indexes
1983        if create_index:
1984            self.create_indexes()
1985
1986        return added_columns

The explode_infos function in Python takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter in the explode_infos function is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded. You can specify the INFO fields you want to explode by passing them as a list to this parameter
  • force: The force parameter in the explode_infos function is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If force is set to True, the column will be dropped and recreated; if set to False, the existing column is kept. Defaults to False
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed together. If set to False, each INFO field will be processed individually. Defaults to False
  • table: The table parameter in the explode_infos function is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for the table parameter, the function will use that table name; otherwise the variants table is used
Returns

The explode_infos function returns a list of added columns.

def create_indexes(self) -> None:
1988    def create_indexes(self) -> None:
1989        """
1990        Create indexes on the table after insertion
1991        """
1992
1993        # Access
1994        access = self.get_config().get("access", None)
1995
1996        # get table variants
1997        table_variants = self.get_table_variants("FROM")
1998
1999        if self.get_indexing() and access not in ["RO"]:
2000            # Create index
2001            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
2002            self.conn.execute(sql_create_table_index)
2003            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
2004            self.conn.execute(sql_create_table_index)
2005            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
2006            self.conn.execute(sql_create_table_index)
2007            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
2008            self.conn.execute(sql_create_table_index)
2009            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
2010            self.conn.execute(sql_create_table_index)
2011            for field in self.index_additionnal_fields:
2012                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
2013                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
2015    def drop_indexes(self) -> None:
2016        """
2017        Create indexes on the table after insertion
2018        """
2019
2020        # Access
2021        access = self.get_config().get("access", None)
2022
2023        # get table variants
2024        table_variants = self.get_table_variants("FROM")
2025
2026        # Get database format
2027        connexion_format = self.get_connexion_format()
2028
2029        if access not in ["RO"]:
2030            if connexion_format in ["duckdb"]:
2031                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
2032            elif connexion_format in ["sqlite"]:
2033                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
2034
2035            list_indexes = self.conn.execute(sql_list_indexes)
2036            index_names = [row[0] for row in list_indexes.fetchall()]
2037            for index in index_names:
2038                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
2039                self.conn.execute(sql_drop_table_index)

Drop the existing indexes on the variants table.

def read_vcf_header(self, f) -> list:
2041    def read_vcf_header(self, f) -> list:
2042        """
2043        It reads the header of a VCF file and returns a list of the header lines
2044
2045        :param f: the file object
2046        :return: The header lines of the VCF file.
2047        """
2048
2049        header_list = []
2050        for line in f:
2051            header_list.append(line)
2052            if line.startswith("#CHROM"):
2053                break
2054        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
2056    def read_vcf_header_file(self, file: str = None) -> list:
2057        """
2058        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
2059        uncompressed files.
2060
2061        :param file: The `file` parameter is a string that represents the path to the VCF header file
2062        that you want to read. It is an optional parameter, so if you don't provide a value, it will
2063        default to `None`
2064        :type file: str
2065        :return: The function `read_vcf_header_file` returns a list.
2066        """
2067
2068        if self.get_input_compressed(input_file=file):
2069            with bgzf.open(file, "rt") as f:
2070                return self.read_vcf_header(f=f)
2071        else:
2072            with open(file, "rt") as f:
2073                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
2075    def execute_query(self, query: str):
2076        """
2077        It takes a query as an argument, executes it, and returns the results
2078
2079        :param query: The query to be executed
2080        :return: The result of the query is being returned.
2081        """
2082        if query:
2083            return self.conn.execute(query)  # .fetchall()
2084        else:
2085            return None

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output( self, output_file: str | None = None, output_header: str | None = None, export_header: bool = True, query: str | None = None, parquet_partitions: list | None = None, chunk_size: int | None = None, threads: int | None = None, sort: bool = False, index: bool = False, order_by: str | None = None) -> bool:
2087    def export_output(
2088        self,
2089        output_file: str | None = None,
2090        output_header: str | None = None,
2091        export_header: bool = True,
2092        query: str | None = None,
2093        parquet_partitions: list | None = None,
2094        chunk_size: int | None = None,
2095        threads: int | None = None,
2096        sort: bool = False,
2097        index: bool = False,
2098        order_by: str | None = None,
2099    ) -> bool:
2100        """
2101        The `export_output` function exports data from a VCF file to a specified output file in various
2102        formats, including VCF, CSV, TSV, PSV, and Parquet.
2103
2104        :param output_file: The `output_file` parameter is a string that specifies the name of the
2105        output file to be generated by the function. This is where the exported data will be saved
2106        :type output_file: str
2107        :param output_header: The `output_header` parameter is a string that specifies the name of the
2108        file where the header of the VCF file will be exported. If this parameter is not provided, the
2109        header will be exported to a file with the same name as the `output_file` parameter, but with
2110        the extension "
2111        :type output_header: str
2112        :param export_header: The `export_header` parameter is a boolean flag that determines whether
2113        the header of a VCF file should be exported to a separate file or not. If `export_header` is
2114        True, the header will be exported to a file. If `export_header` is False, the header will not
2115        be, defaults to True, if output format is not VCF
2116        :type export_header: bool (optional)
2117        :param query: The `query` parameter is an optional SQL query that can be used to filter and
2118        select specific data from the VCF file before exporting it. If provided, only the data that
2119        matches the query will be exported
2120        :type query: str
2121        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
2122        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
2123        organize data in a hierarchical directory structure based on the values of one or more columns.
2124        This can improve query performance when working with large datasets
2125        :type parquet_partitions: list
2126        :param chunk_size: The `chunk_size` parameter specifies the number of
2127        records in batch when exporting data in Parquet format. This parameter is used for
2128        partitioning the Parquet file into multiple files.
2129        :type chunk_size: int
2130        :param threads: The `threads` parameter is an optional parameter that specifies the number of
2131        threads to be used during the export process. It determines the level of parallelism and can
2132        improve the performance of the export operation. If not provided, the function will use the
2133        default number of threads
2134        :type threads: int
2135        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
2136        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
2137        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
2138        False
2139        :type sort: bool (optional)
2140        :param index: The `index` parameter is a boolean flag that determines whether an index should be
2141        created on the output file. If `index` is True, an index will be created. If `index` is False,
2142        no index will be created. The default value is False, defaults to False
2143        :type index: bool (optional)
2144        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
2145        sorting the output file. This parameter is only applicable when exporting data in VCF format
2146        :type order_by: str
2147        :return: a boolean value. It checks if the output file exists and returns True if it does, or
2148        None if it doesn't.
2149        """
2150
2151        # Log
2152        log.info("Exporting...")
2153
2154        # Full path
2155        output_file = full_path(output_file)
2156        output_header = full_path(output_header)
2157
2158        # Config
2159        config = self.get_config()
2160
2161        # Param
2162        param = self.get_param()
2163
2164        # Tmp files to remove
2165        tmp_to_remove = []
2166
2167        # If no output, get it
2168        if not output_file:
2169            output_file = self.get_output()
2170
2171        # If not threads
2172        if not threads:
2173            threads = self.get_threads()
2174
2175        # Auto header name with extension
2176        if export_header or output_header:
2177            if not output_header:
2178                output_header = f"{output_file}.hdr"
2179            # Export header
2180            self.export_header(output_file=output_file)
2181
2182        # Switch off export header if VCF output
2183        output_file_type = get_file_format(output_file)
2184        if output_file_type in ["vcf"]:
2185            export_header = False
2186            tmp_to_remove.append(output_header)
2187
2188        # Chunk size
2189        if not chunk_size:
2190            chunk_size = config.get("chunk_size", None)
2191
2192        # Parquet partition
2193        if not parquet_partitions:
2194            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2195        if parquet_partitions and isinstance(parquet_partitions, str):
2196            parquet_partitions = parquet_partitions.split(",")
2197
2198        # Order by
2199        if not order_by:
2200            order_by = param.get("export", {}).get("order_by", "")
2201
2202        # Header in output
2203        header_in_output = param.get("export", {}).get("include_header", False)
2204
2205        # Database
2206        database_source = self.get_connexion()
2207
2208        # Connexion format
2209        connexion_format = self.get_connexion_format()
2210
2211        # Explode infos
2212        if self.get_explode_infos():
2213            self.explode_infos(
2214                prefix=self.get_explode_infos_prefix(),
2215                fields=self.get_explode_infos_fields(),
2216                force=False,
2217            )
2218
2219        # if connexion_format in ["sqlite"] or query:
2220        if connexion_format in ["sqlite"]:
2221
2222            # Export in Parquet
2223            random_tmp = "".join(
2224                random.choice(string.ascii_lowercase) for i in range(10)
2225            )
2226            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2227            tmp_to_remove.append(database_source)
2228
2229            # Table Variants
2230            table_variants = self.get_table_variants()
2231
2232            # Create export query
2233            sql_query_export_subquery = f"""
2234                SELECT * FROM {table_variants}
2235                """
2236
2237            # Write source file
2238            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2239
2240        # Create database
2241        database = Database(
2242            database=database_source,
2243            table="variants",
2244            header_file=output_header,
2245            conn_config=self.get_connexion_config(),
2246        )
2247
2248        # Existing colomns header
2249        existing_columns_header = database.get_header_columns_from_database(query=query)
2250
2251        # Sample list
2252        if output_file_type in ["vcf"]:
2253            get_samples = self.get_samples()
2254            get_samples_check = self.get_samples_check()
2255            samples_force = get_samples is not None
2256            sample_list = self.get_header_sample_list(
2257                check=get_samples_check,
2258                samples=get_samples,
2259                samples_force=samples_force,
2260            )
2261        else:
2262            sample_list = None
2263
2264        # Export file
2265        database.export(
2266            output_database=output_file,
2267            output_header=output_header,
2268            existing_columns_header=existing_columns_header,
2269            parquet_partitions=parquet_partitions,
2270            chunk_size=chunk_size,
2271            threads=threads,
2272            sort=sort,
2273            index=index,
2274            header_in_output=header_in_output,
2275            order_by=order_by,
2276            query=query,
2277            export_header=export_header,
2278            sample_list=sample_list,
2279        )
2280
2281        # Remove
2282        remove_if_exists(tmp_to_remove)
2283
2284        return (os.path.exists(output_file) or None) and (
2285            os.path.exists(output_file) or None
2286        )

The export_output function exports data from a VCF file to a specified output file in various formats, including VCF, CSV, TSV, PSV, and Parquet.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr"
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file. If export_header is False, the header will not be, defaults to True, if output format is not VCF
  • query: The query parameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files.
  • threads: The threads parameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads
  • sort: The sort parameter is a boolean flag that determines whether the output file should be sorted or not. If sort is set to True, the output file will be sorted based on the genomic coordinates of the variants. By default, the value of sort is False, defaults to False
  • index: The index parameter is a boolean flag that determines whether an index should be created on the output file. If index is True, an index will be created. If index is False, no index will be created. The default value is False, defaults to False
  • order_by: The order_by parameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns

a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
2288    def get_extra_infos(self, table: str = None) -> list:
2289        """
2290        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2291        in the header.
2292
2293        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2294        name of the table from which you want to retrieve the extra columns that are not present in the
2295        header. If the `table` parameter is not provided when calling the function, it will default to
2296        using the variants
2297        :type table: str
2298        :return: A list of columns that are in the specified table but not in the header of the table.
2299        """
2300
2301        header_columns = []
2302
2303        if not table:
2304            table = self.get_table_variants(clause="from")
2305            header_columns = self.get_header_columns()
2306
2307        # Check all columns in the database
2308        query = f""" SELECT * FROM {table} LIMIT 1 """
2309        log.debug(f"query {query}")
2310        table_columns = self.get_query_to_df(query).columns.tolist()
2311        extra_columns = []
2312
2313        # Construct extra infos (not in header)
2314        for column in table_columns:
2315            if column not in header_columns:
2316                extra_columns.append(column)
2317
2318        return extra_columns

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the table parameter is not provided when calling the function, it will default to using the variants table and its header columns
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
2320    def get_extra_infos_sql(self, table: str = None) -> str:
2321        """
2322        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2323        by double quotes
2324
2325        :param table: The name of the table to get the extra infos from. If None, the default table is
2326        used
2327        :type table: str
2328        :return: A string of the extra infos
2329        """
2330
2331        return ", ".join(
2332            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2333        )

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header( self, header_name: str = None, output_file: str = None, output_file_ext: str = '.hdr', clean_header: bool = True, remove_chrom_line: bool = False) -> str:
2335    def export_header(
2336        self,
2337        header_name: str = None,
2338        output_file: str = None,
2339        output_file_ext: str = ".hdr",
2340        clean_header: bool = True,
2341        remove_chrom_line: bool = False,
2342    ) -> str:
2343        """
2344        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2345        specified options, and writes it to a new file.
2346
2347        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2348        this parameter is not specified, the header will be written to the output file
2349        :type header_name: str
2350        :param output_file: The `output_file` parameter in the `export_header` function is used to
2351        specify the name of the output file where the header will be written. If this parameter is not
2352        provided, the header will be written to a temporary file
2353        :type output_file: str
2354        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2355        string that represents the extension of the output header file. By default, it is set to ".hdr"
2356        if not specified by the user. This extension will be appended to the `output_file` name to
2357        create the final, defaults to .hdr
2358        :type output_file_ext: str (optional)
2359        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2360        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2361        `True`, the function will clean the header by modifying certain lines based on a specific
2362        pattern. If `clean_header`, defaults to True
2363        :type clean_header: bool (optional)
2364        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2365        boolean flag that determines whether the #CHROM line should be removed from the header before
2366        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2367        defaults to False
2368        :type remove_chrom_line: bool (optional)
2369        :return: The function `export_header` returns the name of the temporary header file that is
2370        created.
2371        """
2372
2373        if not header_name and not output_file:
2374            output_file = self.get_output()
2375
2376        if self.get_header():
2377
2378            # Get header object
2379            header_obj = self.get_header()
2380
2381            # Create database
2382            db_for_header = Database(database=self.get_input())
2383
2384            # Get real columns in the file
2385            db_header_columns = db_for_header.get_columns()
2386
2387            with tempfile.TemporaryDirectory() as tmpdir:
2388
2389                # Write header file
2390                header_file_tmp = os.path.join(tmpdir, "header")
2391                f = open(header_file_tmp, "w")
2392                vcf.Writer(f, header_obj)
2393                f.close()
2394
2395                # Replace #CHROM line with rel columns
2396                header_list = db_for_header.read_header_file(
2397                    header_file=header_file_tmp
2398                )
2399                header_list[-1] = "\t".join(db_header_columns)
2400
2401                # Remove CHROM line
2402                if remove_chrom_line:
2403                    header_list.pop()
2404
2405                # Clean header
2406                if clean_header:
2407                    header_list_clean = []
2408                    for head in header_list:
2409                        # Clean head for malformed header
2410                        head_clean = head
2411                        head_clean = re.subn(
2412                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2413                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2414                            head_clean,
2415                            2,
2416                        )[0]
2417                        # Write header
2418                        header_list_clean.append(head_clean)
2419                    header_list = header_list_clean
2420
2421            tmp_header_name = output_file + output_file_ext
2422
2423            f = open(tmp_header_name, "w")
2424            for line in header_list:
2425                f.write(line)
2426            f.close()
2427
2428        return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern. If clean_header, defaults to True
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it is kept. Defaults to False
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2430    def export_variant_vcf(
2431        self,
2432        vcf_file,
2433        remove_info: bool = False,
2434        add_samples: bool = True,
2435        list_samples: list = [],
2436        where_clause: str = "",
2437        index: bool = False,
2438        threads: int | None = None,
2439    ) -> bool | None:
2440        """
2441        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2442        remove INFO field, add samples, and control compression and indexing.
2443
2444        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2445        written to. It is the output file that will contain the filtered VCF data based on the specified
2446        parameters
2447        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2448        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2449        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2450        in, defaults to False
2451        :type remove_info: bool (optional)
2452        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2453        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2454        If set to False, the samples will be removed. The default value is True, defaults to True
2455        :type add_samples: bool (optional)
2456        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2457        in the output VCF file. By default, all samples will be included. If you provide a list of
2458        samples, only those samples will be included in the output file
2459        :type list_samples: list
2460        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2461        determines whether or not to create an index for the output VCF file. If `index` is set to
2462        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2463        :type index: bool (optional)
2464        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2465        number of threads to use for exporting the VCF file. It determines how many parallel threads
2466        will be used during the export process. More threads can potentially speed up the export process
2467        by utilizing multiple cores of the processor. If
2468        :type threads: int | None
2469        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2470        method with various parameters including the output file, query, threads, sort flag, and index
2471        flag. The `export_output` method is responsible for exporting the VCF data based on the
2472        specified parameters and configurations provided in the `export_variant_vcf` function.
2473        """
2474
2475        # Config
2476        config = self.get_config()
2477
2478        # Extract VCF
2479        log.debug("Export VCF...")
2480
2481        # Table variants
2482        table_variants = self.get_table_variants()
2483
2484        # Threads
2485        if not threads:
2486            threads = self.get_threads()
2487
2488        # Info fields
2489        if remove_info:
2490            if not isinstance(remove_info, str):
2491                remove_info = "."
2492            info_field = f"""'{remove_info}' as INFO"""
2493        else:
2494            info_field = "INFO"
2495
2496        # Samples fields
2497        if add_samples:
2498            if not list_samples:
2499                list_samples = self.get_header_sample_list()
2500            if list_samples:
2501                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2502            else:
2503                samples_fields = ""
2504            log.debug(f"samples_fields: {samples_fields}")
2505        else:
2506            samples_fields = ""
2507
2508        # Where clause
2509        if where_clause is None:
2510            where_clause = ""
2511
2512        # Variants
2513        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2514        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2515        log.debug(f"sql_query_select={sql_query_select}")
2516
2517        return self.export_output(
2518            output_file=vcf_file,
2519            output_header=None,
2520            export_header=True,
2521            query=sql_query_select,
2522            parquet_partitions=None,
2523            chunk_size=config.get("chunk_size", None),
2524            threads=threads,
2525            sort=True,
2526            index=index,
2527            order_by=None,
2528        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed; if set to False, the INFO field will be included in the output. Defaults to False
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix; if set to False, no index is created. Defaults to False
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If not specified, the number of threads configured for the instance is used
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = [], threads: int = 1) -> None:
    def run_commands(self, commands: list = [], threads: int = 1) -> None:
        """
        Run a list of shell commands in parallel.

        Delegates to `run_parallel_commands`, which executes the given
        commands using up to `threads` parallel workers. The default list is
        never mutated by this method.

        :param commands: A list of commands to run
        :param threads: The number of threads to use, defaults to 1 (optional)
        """

        run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
2540    def get_threads(self, default: int = 1) -> int:
2541        """
2542        This function returns the number of threads to use for a job, with a default value of 1 if not
2543        specified.
2544
2545        :param default: The `default` parameter in the `get_threads` method is used to specify the
2546        default number of threads to use if no specific value is provided. If no value is provided for
2547        the `threads` parameter in the configuration or input parameters, the `default` value will be
2548        used, defaults to 1
2549        :type default: int (optional)
2550        :return: the number of threads to use for the current job.
2551        """
2552
2553        # Config
2554        config = self.get_config()
2555
2556        # Param
2557        param = self.get_param()
2558
2559        # Input threads
2560        input_thread = param.get("threads", config.get("threads", None))
2561
2562        # Check threads
2563        if not input_thread:
2564            threads = default
2565        elif int(input_thread) <= 0:
2566            threads = os.cpu_count()
2567        else:
2568            threads = int(input_thread)
2569        return threads

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
2571    def get_memory(self, default: str = None) -> str:
2572        """
2573        This function retrieves the memory value from parameters or configuration with a default value
2574        if not found.
2575
2576        :param default: The `get_memory` function takes in a default value as a string parameter. This
2577        default value is used as a fallback in case the `memory` parameter is not provided in the
2578        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2579        the function
2580        :type default: str
2581        :return: The `get_memory` function returns a string value representing the memory parameter. If
2582        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2583        return the default value provided as an argument to the function.
2584        """
2585
2586        # Config
2587        config = self.get_config()
2588
2589        # Param
2590        param = self.get_param()
2591
2592        # Input threads
2593        input_memory = param.get("memory", config.get("memory", None))
2594
2595        # Check threads
2596        if input_memory:
2597            memory = input_memory
2598        else:
2599            memory = default
2600
2601        return memory

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: The get_memory function takes a default value as a string parameter. This value is used as a fallback in case the memory parameter is not provided in the param dictionary or the config dictionary. If memory is not found in either dictionary, the function returns this default value
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
2603    def update_from_vcf(self, vcf_file: str) -> None:
2604        """
2605        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2606
2607        :param vcf_file: the path to the VCF file
2608        """
2609
2610        connexion_format = self.get_connexion_format()
2611
2612        if connexion_format in ["duckdb"]:
2613            self.update_from_vcf_duckdb(vcf_file)
2614        elif connexion_format in ["sqlite"]:
2615            self.update_from_vcf_sqlite(vcf_file)

If the database is duckdb, then use the parquet method, otherwise use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge INFO annotations from a VCF file into the variants table (DuckDB).

        The VCF data rows are loaded into a pandas DataFrame, which DuckDB
        can query directly by its Python variable name (`vcf_df`). The INFO
        column of the variants table is extended with the matching INFO
        values (joined on #CHROM/POS/REF/ALT), with ';' inserted as a
        separator when both sides already carry annotations. Empty ('' or
        '.') INFO values are treated as "no annotation".

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame, skipping the meta-header so the
        # #CHROM line becomes the column header
        # (assumes get_header_length() counts exactly the '##' lines — the
        # row at index `skip` is then the #CHROM header; TODO confirm)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: `vcf_df` looks unused but is referenced by name inside the
        # SQL below — DuckDB resolves it from the local Python scope
        # (replacement scan). `concat` ignores NULLs, so rows with no match
        # in the VCF keep their current INFO.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2673    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2674        """
2675        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
2676        table, then updates the INFO column of the variants table with the INFO column of the temporary
2677        table
2678
2679        :param vcf_file: The path to the VCF file you want to update the database with
2680        """
2681
2682        # Create a temporary table for the VCF
2683        table_vcf = "tmp_vcf"
2684        sql_create = (
2685            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
2686        )
2687        self.conn.execute(sql_create)
2688
2689        # Loading VCF into temporaire table
2690        vcf_df = pd.read_csv(
2691            vcf_file, sep="\t", comment="#", header=None, low_memory=False
2692        )
2693        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
2694        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
2695
2696        # Update table 'variants' with VCF data
2697        # warning: CONCAT as || operator
2698        sql_query_update = f"""
2699            UPDATE variants as table_variants
2700            SET INFO = CASE
2701                            WHEN INFO NOT IN ('', '.')
2702                            THEN INFO
2703                            ELSE ''
2704                        END ||
2705                        (
2706                        SELECT 
2707                            CASE 
2708                                WHEN table_variants.INFO NOT IN ('','.') 
2709                                    AND table_vcf.INFO NOT IN ('','.')  
2710                                THEN ';' 
2711                                ELSE '' 
2712                            END || 
2713                            CASE 
2714                                WHEN table_vcf.INFO NOT IN ('','.') 
2715                                THEN table_vcf.INFO 
2716                                ELSE '' 
2717                            END
2718                        FROM {table_vcf} as table_vcf
2719                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
2720                            AND table_vcf.\"POS\" = table_variants.\"POS\"
2721                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
2722                            AND table_vcf.\"REF\" = table_variants.\"REF\"
2723                        )
2724        """
2725        self.conn.execute(sql_query_update)
2726
2727        # Drop temporary table
2728        sql_drop = f"DROP TABLE {table_vcf}"
2729        self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
2731    def drop_variants_table(self) -> None:
2732        """
2733        > This function drops the variants table
2734        """
2735
2736        table_variants = self.get_table_variants()
2737        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2738        self.conn.execute(sql_table_variants)

This function drops the variants table

def set_variant_id(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2740    def set_variant_id(
2741        self, variant_id_column: str = "variant_id", force: bool = None
2742    ) -> str:
2743        """
2744        It adds a column to the variants table called `variant_id` and populates it with a hash of the
2745        `#CHROM`, `POS`, `REF`, and `ALT` columns
2746
2747        :param variant_id_column: The name of the column to be created in the variants table, defaults
2748        to variant_id
2749        :type variant_id_column: str (optional)
2750        :param force: If True, the variant_id column will be created even if it already exists
2751        :type force: bool
2752        :return: The name of the column that contains the variant_id
2753        """
2754
2755        # Assembly
2756        assembly = self.get_param().get(
2757            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
2758        )
2759
2760        # INFO/Tag prefix
2761        prefix = self.get_explode_infos_prefix()
2762
2763        # Explode INFO/SVTYPE
2764        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
2765
2766        # variants table
2767        table_variants = self.get_table_variants()
2768
2769        # variant_id column
2770        if not variant_id_column:
2771            variant_id_column = "variant_id"
2772
2773        # Creta variant_id column
2774        if "variant_id" not in self.get_extra_infos() or force:
2775
2776            # Create column
2777            self.add_column(
2778                table_name=table_variants,
2779                column_name=variant_id_column,
2780                column_type="UBIGINT",
2781                default_value="0",
2782            )
2783
2784            # Update column
2785            self.conn.execute(
2786                f"""
2787                    UPDATE {table_variants}
2788                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
2789                """
2790            )
2791
2792        # Remove added columns
2793        for added_column in added_columns:
2794            self.drop_column(column=added_column)
2795
2796        # return variant_id column name
2797        return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
    def get_variant_id_column(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Return the name of the variant-id column, creating and populating it
        if needed (delegates to `set_variant_id`).

        :param variant_id_column: The name of the column that contains the
            variant IDs, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: Passed through to `set_variant_id`; when True the
            column is recomputed even if it already exists
        :type force: bool
        :return: The variant_id column name.
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns

The variant_id column name.

def scan_databases( self, database_formats: list = ['parquet'], database_releases: list = ['current']) -> dict:
2821    def scan_databases(
2822        self,
2823        database_formats: list = ["parquet"],
2824        database_releases: list = ["current"],
2825    ) -> dict:
2826        """
2827        The function `scan_databases` scans for available databases based on specified formats and
2828        releases.
2829
2830        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2831        of the databases to be scanned. In this case, the accepted format is "parquet"
2832        :type database_formats: list ["parquet"]
2833        :param database_releases: The `database_releases` parameter is a list that specifies the
2834        releases of the databases to be scanned. In the provided function, the default value for
2835        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2836        databases that are in the "current"
2837        :type database_releases: list
2838        :return: The function `scan_databases` returns a dictionary containing information about
2839        databases that match the specified formats and releases.
2840        """
2841
2842        # Config
2843        config = self.get_config()
2844
2845        # Param
2846        param = self.get_param()
2847
2848        # Param - Assembly
2849        assembly = param.get("assembly", config.get("assembly", None))
2850        if not assembly:
2851            assembly = DEFAULT_ASSEMBLY
2852            log.warning(f"Default assembly '{assembly}'")
2853
2854        # Scan for availabled databases
2855        log.info(
2856            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2857        )
2858        databases_infos_dict = databases_infos(
2859            database_folder_releases=database_releases,
2860            database_formats=database_formats,
2861            assembly=assembly,
2862            config=config,
2863        )
2864        log.info(
2865            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2866        )
2867
2868        return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. Its default value is ["current"], meaning that by default the function scans databases in the "current" release
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
2870    def annotation(self) -> None:
2871        """
2872        It annotates the VCF file with the annotations specified in the config file.
2873        """
2874
2875        # Config
2876        config = self.get_config()
2877
2878        # Param
2879        param = self.get_param()
2880
2881        # Param - Assembly
2882        assembly = param.get("assembly", config.get("assembly", None))
2883        if not assembly:
2884            assembly = DEFAULT_ASSEMBLY
2885            log.warning(f"Default assembly '{assembly}'")
2886
2887        # annotations databases folders
2888        annotations_databases = set(
2889            config.get("folders", {})
2890            .get("databases", {})
2891            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2892            + config.get("folders", {})
2893            .get("databases", {})
2894            .get("parquet", ["~/howard/databases/parquet/current"])
2895            + config.get("folders", {})
2896            .get("databases", {})
2897            .get("bcftools", ["~/howard/databases/bcftools/current"])
2898        )
2899
2900        # Get param annotations
2901        if param.get("annotations", None) and isinstance(
2902            param.get("annotations", None), str
2903        ):
2904            log.debug(param.get("annotations", None))
2905            param_annotation_list = param.get("annotations").split(",")
2906        else:
2907            param_annotation_list = []
2908
2909        # Each tools param
2910        if param.get("annotation_parquet", None) != None:
2911            log.debug(
2912                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2913            )
2914            if isinstance(param.get("annotation_parquet", None), list):
2915                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2916            else:
2917                param_annotation_list.append(param.get("annotation_parquet"))
2918        if param.get("annotation_snpsift", None) != None:
2919            if isinstance(param.get("annotation_snpsift", None), list):
2920                param_annotation_list.append(
2921                    "snpsift:"
2922                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2923                )
2924            else:
2925                param_annotation_list.append(
2926                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2927                )
2928        if param.get("annotation_snpeff", None) != None:
2929            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2930        if param.get("annotation_bcftools", None) != None:
2931            if isinstance(param.get("annotation_bcftools", None), list):
2932                param_annotation_list.append(
2933                    "bcftools:"
2934                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2935                )
2936            else:
2937                param_annotation_list.append(
2938                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2939                )
2940        if param.get("annotation_annovar", None) != None:
2941            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2942        if param.get("annotation_exomiser", None) != None:
2943            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2944        if param.get("annotation_splice", None) != None:
2945            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2946
2947        # Merge param annotations list
2948        param["annotations"] = ",".join(param_annotation_list)
2949
2950        # debug
2951        log.debug(f"param_annotations={param['annotations']}")
2952
2953        if param.get("annotations"):
2954
2955            # Log
2956            # log.info("Annotations - Check annotation parameters")
2957
2958            if not "annotation" in param:
2959                param["annotation"] = {}
2960
2961            # List of annotations parameters
2962            annotations_list_input = {}
2963            if isinstance(param.get("annotations", None), str):
2964                annotation_file_list = [
2965                    value for value in param.get("annotations", "").split(",")
2966                ]
2967                for annotation_file in annotation_file_list:
2968                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
2969            else:
2970                annotations_list_input = param.get("annotations", {})
2971
2972            log.info(f"Quick Annotations:")
2973            for annotation_key in list(annotations_list_input.keys()):
2974                log.info(f"   {annotation_key}")
2975
2976            # List of annotations and associated fields
2977            annotations_list = {}
2978
2979            for annotation_file in annotations_list_input:
2980
2981                # Explode annotations if ALL
2982                if (
2983                    annotation_file.upper() == "ALL"
2984                    or annotation_file.upper().startswith("ALL:")
2985                ):
2986
2987                    # check ALL parameters (formats, releases)
2988                    annotation_file_split = annotation_file.split(":")
2989                    database_formats = "parquet"
2990                    database_releases = "current"
2991                    for annotation_file_option in annotation_file_split[1:]:
2992                        database_all_options_split = annotation_file_option.split("=")
2993                        if database_all_options_split[0] == "format":
2994                            database_formats = database_all_options_split[1].split("+")
2995                        if database_all_options_split[0] == "release":
2996                            database_releases = database_all_options_split[1].split("+")
2997
2998                    # Scan for availabled databases
2999                    databases_infos_dict = self.scan_databases(
3000                        database_formats=database_formats,
3001                        database_releases=database_releases,
3002                    )
3003
3004                    # Add found databases in annotation parameters
3005                    for database_infos in databases_infos_dict.keys():
3006                        annotations_list[database_infos] = {"INFO": None}
3007
3008                else:
3009                    annotations_list[annotation_file] = annotations_list_input[
3010                        annotation_file
3011                    ]
3012
3013            # Check each databases
3014            if len(annotations_list):
3015
3016                log.info(
3017                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
3018                )
3019
3020                for annotation_file in annotations_list:
3021
3022                    # Init
3023                    annotations = annotations_list.get(annotation_file, None)
3024
3025                    # Annotation snpEff
3026                    if annotation_file.startswith("snpeff"):
3027
3028                        log.debug(f"Quick Annotation snpEff")
3029
3030                        if "snpeff" not in param["annotation"]:
3031                            param["annotation"]["snpeff"] = {}
3032
3033                        if "options" not in param["annotation"]["snpeff"]:
3034                            param["annotation"]["snpeff"]["options"] = ""
3035
3036                        # snpEff options in annotations
3037                        param["annotation"]["snpeff"]["options"] = "".join(
3038                            annotation_file.split(":")[1:]
3039                        )
3040
3041                    # Annotation Annovar
3042                    elif annotation_file.startswith("annovar"):
3043
3044                        log.debug(f"Quick Annotation Annovar")
3045
3046                        if "annovar" not in param["annotation"]:
3047                            param["annotation"]["annovar"] = {}
3048
3049                        if "annotations" not in param["annotation"]["annovar"]:
3050                            param["annotation"]["annovar"]["annotations"] = {}
3051
3052                        # Options
3053                        annotation_file_split = annotation_file.split(":")
3054                        for annotation_file_annotation in annotation_file_split[1:]:
3055                            if annotation_file_annotation:
3056                                param["annotation"]["annovar"]["annotations"][
3057                                    annotation_file_annotation
3058                                ] = annotations
3059
3060                    # Annotation Exomiser
3061                    elif annotation_file.startswith("exomiser"):
3062
3063                        log.debug(f"Quick Annotation Exomiser")
3064
3065                        param["annotation"]["exomiser"] = params_string_to_dict(
3066                            annotation_file
3067                        )
3068
3069                    # Annotation Splice
3070                    elif annotation_file.startswith("splice"):
3071
3072                        log.debug(f"Quick Annotation Splice")
3073
3074                        param["annotation"]["splice"] = params_string_to_dict(
3075                            annotation_file
3076                        )
3077
3078                    # Annotation Parquet or BCFTOOLS
3079                    else:
3080
3081                        # Tools detection
3082                        if annotation_file.startswith("bcftools:"):
3083                            annotation_tool_initial = "bcftools"
3084                            annotation_file = ":".join(annotation_file.split(":")[1:])
3085                        elif annotation_file.startswith("snpsift:"):
3086                            annotation_tool_initial = "snpsift"
3087                            annotation_file = ":".join(annotation_file.split(":")[1:])
3088                        elif annotation_file.startswith("bigwig:"):
3089                            annotation_tool_initial = "bigwig"
3090                            annotation_file = ":".join(annotation_file.split(":")[1:])
3091                        else:
3092                            annotation_tool_initial = None
3093
3094                        # list of files
3095                        annotation_file_list = annotation_file.replace("+", ":").split(
3096                            ":"
3097                        )
3098
3099                        for annotation_file in annotation_file_list:
3100
3101                            if annotation_file:
3102
3103                                # Annotation tool initial
3104                                annotation_tool = annotation_tool_initial
3105
3106                                # Find file
3107                                annotation_file_found = None
3108
3109                                if os.path.exists(annotation_file):
3110                                    annotation_file_found = annotation_file
3111                                elif os.path.exists(full_path(annotation_file)):
3112                                    annotation_file_found = full_path(annotation_file)
3113                                else:
3114                                    # Find within assembly folders
3115                                    for annotations_database in annotations_databases:
3116                                        found_files = find_all(
3117                                            annotation_file,
3118                                            os.path.join(
3119                                                annotations_database, assembly
3120                                            ),
3121                                        )
3122                                        if len(found_files) > 0:
3123                                            annotation_file_found = found_files[0]
3124                                            break
3125                                    if not annotation_file_found and not assembly:
3126                                        # Find within folders
3127                                        for (
3128                                            annotations_database
3129                                        ) in annotations_databases:
3130                                            found_files = find_all(
3131                                                annotation_file, annotations_database
3132                                            )
3133                                            if len(found_files) > 0:
3134                                                annotation_file_found = found_files[0]
3135                                                break
3136                                log.debug(
3137                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
3138                                )
3139
3140                                # Full path
3141                                annotation_file_found = full_path(annotation_file_found)
3142
3143                                if annotation_file_found:
3144
3145                                    database = Database(database=annotation_file_found)
3146                                    quick_annotation_format = database.get_format()
3147                                    quick_annotation_is_compressed = (
3148                                        database.is_compressed()
3149                                    )
3150                                    quick_annotation_is_indexed = os.path.exists(
3151                                        f"{annotation_file_found}.tbi"
3152                                    )
3153                                    bcftools_preference = False
3154
3155                                    # Check Annotation Tool
3156                                    if not annotation_tool:
3157                                        if (
3158                                            bcftools_preference
3159                                            and quick_annotation_format
3160                                            in ["vcf", "bed"]
3161                                            and quick_annotation_is_compressed
3162                                            and quick_annotation_is_indexed
3163                                        ):
3164                                            annotation_tool = "bcftools"
3165                                        elif quick_annotation_format in [
3166                                            "vcf",
3167                                            "bed",
3168                                            "tsv",
3169                                            "tsv",
3170                                            "csv",
3171                                            "json",
3172                                            "tbl",
3173                                            "parquet",
3174                                            "duckdb",
3175                                        ]:
3176                                            annotation_tool = "parquet"
3177                                        elif quick_annotation_format in ["bw"]:
3178                                            annotation_tool = "bigwig"
3179                                        else:
3180                                            log.error(
3181                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3182                                            )
3183                                            raise ValueError(
3184                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3185                                            )
3186
3187                                    log.debug(
3188                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3189                                    )
3190
3191                                    # Annotation Tool dispatch
3192                                    if annotation_tool:
3193                                        if annotation_tool not in param["annotation"]:
3194                                            param["annotation"][annotation_tool] = {}
3195                                        if (
3196                                            "annotations"
3197                                            not in param["annotation"][annotation_tool]
3198                                        ):
3199                                            param["annotation"][annotation_tool][
3200                                                "annotations"
3201                                            ] = {}
3202                                        param["annotation"][annotation_tool][
3203                                            "annotations"
3204                                        ][annotation_file_found] = annotations
3205
3206                                else:
3207                                    log.warning(
3208                                        f"Quick Annotation File {annotation_file} does NOT exist"
3209                                    )
3210
3211                self.set_param(param)
3212
3213        if param.get("annotation", None):
3214            log.info("Annotations")
3215            if param.get("annotation", {}).get("parquet", None):
3216                log.info("Annotations 'parquet'...")
3217                self.annotation_parquet()
3218            if param.get("annotation", {}).get("bcftools", None):
3219                log.info("Annotations 'bcftools'...")
3220                self.annotation_bcftools()
3221            if param.get("annotation", {}).get("snpsift", None):
3222                log.info("Annotations 'snpsift'...")
3223                self.annotation_snpsift()
3224            if param.get("annotation", {}).get("bigwig", None):
3225                log.info("Annotations 'bigwig'...")
3226                self.annotation_bigwig()
3227            if param.get("annotation", {}).get("annovar", None):
3228                log.info("Annotations 'annovar'...")
3229                self.annotation_annovar()
3230            if param.get("annotation", {}).get("snpeff", None):
3231                log.info("Annotations 'snpeff'...")
3232                self.annotation_snpeff()
3233            if param.get("annotation", {}).get("exomiser", None) is not None:
3234                log.info("Annotations 'exomiser'...")
3235                self.annotation_exomiser()
3236            if param.get("annotation", {}).get("splice", None) is not None:
3237                log.info("Annotations 'splice' ...")
3238                self.annotation_splice()
3239
3240        # Explode INFOS fields into table fields
3241        if self.get_explode_infos():
3242            self.explode_infos(
3243                prefix=self.get_explode_infos_prefix(),
3244                fields=self.get_explode_infos_fields(),
3245                force=True,
3246            )

It annotates the VCF file with the annotations specified in the config file.

def annotation_bigwig(self, threads: int = None) -> None:
3248    def annotation_bigwig(self, threads: int = None) -> None:
3249        """
3250        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
3251
3252        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
3253        number of threads to be used for parallel processing during the annotation process. If the
3254        `threads` parameter is not provided, the method will attempt to determine the optimal number of
3255        threads to use based on the system configuration
3256        :type threads: int
3257        :return: True
3258        """
3259
3260        # DEBUG
3261        log.debug("Start annotation with bigwig databases")
3262
3263        # # Threads
3264        # if not threads:
3265        #     threads = self.get_threads()
3266        # log.debug("Threads: " + str(threads))
3267
3268        # Config
3269        config = self.get_config()
3270        log.debug("Config: " + str(config))
3271
3272        # Config - BCFTools databases folders
3273        databases_folders = set(
3274            self.get_config()
3275            .get("folders", {})
3276            .get("databases", {})
3277            .get("annotations", ["."])
3278            + self.get_config()
3279            .get("folders", {})
3280            .get("databases", {})
3281            .get("bigwig", ["."])
3282        )
3283        log.debug("Databases annotations: " + str(databases_folders))
3284
3285        # Param
3286        annotations = (
3287            self.get_param()
3288            .get("annotation", {})
3289            .get("bigwig", {})
3290            .get("annotations", None)
3291        )
3292        log.debug("Annotations: " + str(annotations))
3293
3294        # Assembly
3295        assembly = self.get_param().get(
3296            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3297        )
3298
3299        # Data
3300        table_variants = self.get_table_variants()
3301
3302        # Check if not empty
3303        log.debug("Check if not empty")
3304        sql_query_chromosomes = (
3305            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3306        )
3307        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3308        if not sql_query_chromosomes_df["count"][0]:
3309            log.info(f"VCF empty")
3310            return
3311
3312        # VCF header
3313        vcf_reader = self.get_header()
3314        log.debug("Initial header: " + str(vcf_reader.infos))
3315
3316        # Existing annotations
3317        for vcf_annotation in self.get_header().infos:
3318
3319            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3320            log.debug(
3321                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3322            )
3323
3324        if annotations:
3325
3326            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3327
3328                # Export VCF file
3329                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3330
3331                # annotation_bigwig_config
3332                annotation_bigwig_config_list = []
3333
3334                for annotation in annotations:
3335                    annotation_fields = annotations[annotation]
3336
3337                    # Annotation Name
3338                    annotation_name = os.path.basename(annotation)
3339
3340                    if not annotation_fields:
3341                        annotation_fields = {"INFO": None}
3342
3343                    log.debug(f"Annotation '{annotation_name}'")
3344                    log.debug(
3345                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3346                    )
3347
3348                    # Create Database
3349                    database = Database(
3350                        database=annotation,
3351                        databases_folders=databases_folders,
3352                        assembly=assembly,
3353                    )
3354
3355                    # Find files
3356                    db_file = database.get_database()
3357                    db_file = full_path(db_file)
3358                    db_hdr_file = database.get_header_file()
3359                    db_hdr_file = full_path(db_hdr_file)
3360                    db_file_type = database.get_format()
3361
3362                    # If db_file is http ?
3363                    if database.get_database().startswith("http"):
3364
3365                        # Datbase is HTTP URL
3366                        db_file_is_http = True
3367
3368                        # DB file keep as URL
3369                        db_file = database.get_database()
3370                        log.warning(
3371                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
3372                        )
3373
3374                        # Retrieve automatic annotation field name
3375                        annotation_field = clean_annotation_field(
3376                            os.path.basename(db_file).replace(".bw", "")
3377                        )
3378                        log.debug(
3379                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
3380                        )
3381
3382                        # Create automatic header file
3383                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
3384                        with open(db_hdr_file, "w") as f:
3385                            f.write("##fileformat=VCFv4.2\n")
3386                            f.write(
3387                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
3388                            )
3389                            f.write(f"#CHROM	START	END	{annotation_field}\n")
3390
3391                    else:
3392
3393                        # Datbase is NOT HTTP URL
3394                        db_file_is_http = False
3395
3396                    # Check index - try to create if not exists
3397                    if (
3398                        db_file is None
3399                        or db_hdr_file is None
3400                        or (not os.path.exists(db_file) and not db_file_is_http)
3401                        or not os.path.exists(db_hdr_file)
3402                        or not db_file_type in ["bw"]
3403                    ):
3404                        # if False:
3405                        log.error("Annotation failed: database not valid")
3406                        log.error(f"Annotation annotation file: {db_file}")
3407                        log.error(f"Annotation annotation file type: {db_file_type}")
3408                        log.error(f"Annotation annotation header: {db_hdr_file}")
3409                        raise ValueError(
3410                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
3411                        )
3412                    else:
3413
3414                        # Log
3415                        log.debug(
3416                            f"Annotation '{annotation}' - file: "
3417                            + str(db_file)
3418                            + " and "
3419                            + str(db_hdr_file)
3420                        )
3421
3422                        # Load header as VCF object
3423                        db_hdr_vcf = Variants(input=db_hdr_file)
3424                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3425                        log.debug(
3426                            "Annotation database header: "
3427                            + str(db_hdr_vcf_header_infos)
3428                        )
3429
3430                        # For all fields in database
3431                        annotation_fields_full = False
3432                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3433                            annotation_fields = {
3434                                key: key for key in db_hdr_vcf_header_infos
3435                            }
3436                            log.debug(
3437                                "Annotation database header - All annotations added: "
3438                                + str(annotation_fields)
3439                            )
3440                            annotation_fields_full = True
3441
3442                        # Init
3443                        cyvcf2_header_rename_dict = {}
3444                        cyvcf2_header_list = []
3445                        cyvcf2_header_indexes = {}
3446
3447                        # process annotation fields
3448                        for annotation_field in annotation_fields:
3449
3450                            # New annotation name
3451                            annotation_field_new = annotation_fields[annotation_field]
3452
3453                            # Check annotation field and index in header
3454                            if (
3455                                annotation_field
3456                                in db_hdr_vcf.get_header_columns_as_list()
3457                            ):
3458                                annotation_field_index = (
3459                                    db_hdr_vcf.get_header_columns_as_list().index(
3460                                        annotation_field
3461                                    )
3462                                    - 3
3463                                )
3464                                cyvcf2_header_indexes[annotation_field_new] = (
3465                                    annotation_field_index
3466                                )
3467                            else:
3468                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
3469                                log.error(msg_err)
3470                                raise ValueError(msg_err)
3471
3472                            # Append annotation field in cyvcf2 header list
3473                            cyvcf2_header_rename_dict[annotation_field_new] = (
3474                                db_hdr_vcf_header_infos[annotation_field].id
3475                            )
3476                            cyvcf2_header_list.append(
3477                                {
3478                                    "ID": annotation_field_new,
3479                                    "Number": db_hdr_vcf_header_infos[
3480                                        annotation_field
3481                                    ].num,
3482                                    "Type": db_hdr_vcf_header_infos[
3483                                        annotation_field
3484                                    ].type,
3485                                    "Description": db_hdr_vcf_header_infos[
3486                                        annotation_field
3487                                    ].desc,
3488                                }
3489                            )
3490
3491                            # Add header on VCF
3492                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
3493                                annotation_field_new,
3494                                db_hdr_vcf_header_infos[annotation_field].num,
3495                                db_hdr_vcf_header_infos[annotation_field].type,
3496                                db_hdr_vcf_header_infos[annotation_field].desc,
3497                                "HOWARD BigWig annotation",
3498                                "unknown",
3499                                self.code_type_map[
3500                                    db_hdr_vcf_header_infos[annotation_field].type
3501                                ],
3502                            )
3503
3504                        # Load bigwig database
3505                        bw_db = pyBigWig.open(db_file)
3506                        if bw_db.isBigWig():
3507                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
3508                        else:
3509                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
3510                            log.error(msg_err)
3511                            raise ValueError(msg_err)
3512
3513                        annotation_bigwig_config_list.append(
3514                            {
3515                                "db_file": db_file,
3516                                "bw_db": bw_db,
3517                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
3518                                "cyvcf2_header_list": cyvcf2_header_list,
3519                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
3520                            }
3521                        )
3522
3523                # Annotate
3524                if annotation_bigwig_config_list:
3525
3526                    # Annotation config
3527                    log.debug(
3528                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
3529                    )
3530
3531                    # Export VCF file
3532                    self.export_variant_vcf(
3533                        vcf_file=tmp_vcf_name,
3534                        remove_info=True,
3535                        add_samples=False,
3536                        index=True,
3537                    )
3538
3539                    # Load input tmp file
3540                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
3541
3542                    # Add header in input file
3543                    for annotation_bigwig_config in annotation_bigwig_config_list:
3544                        for cyvcf2_header_field in annotation_bigwig_config.get(
3545                            "cyvcf2_header_list", []
3546                        ):
3547                            log.info(
3548                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
3549                            )
3550                            input_vcf.add_info_to_header(cyvcf2_header_field)
3551
3552                    # Create output VCF file
3553                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
3554                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
3555
3556                    # Fetch variants
3557                    log.info(f"Annotations 'bigwig' start...")
3558                    for variant in input_vcf:
3559
3560                        for annotation_bigwig_config in annotation_bigwig_config_list:
3561
3562                            # DB and indexes
3563                            bw_db = annotation_bigwig_config.get("bw_db", None)
3564                            cyvcf2_header_indexes = annotation_bigwig_config.get(
3565                                "cyvcf2_header_indexes", None
3566                            )
3567
3568                            # Retrieve value from chrom pos
3569                            res = bw_db.values(
3570                                variant.CHROM, variant.POS - 1, variant.POS
3571                            )
3572
3573                            # For each annotation fields (and indexes)
3574                            for cyvcf2_header_index in cyvcf2_header_indexes:
3575
3576                                # If value is NOT nNone
3577                                if not np.isnan(
3578                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
3579                                ):
3580                                    variant.INFO[cyvcf2_header_index] = res[
3581                                        cyvcf2_header_indexes[cyvcf2_header_index]
3582                                    ]
3583
3584                        # Add record in output file
3585                        output_vcf.write_record(variant)
3586
3587                    # Log
3588                    log.debug(f"Annotation done.")
3589
3590                    # Close and write file
3591                    log.info(f"Annotations 'bigwig' write...")
3592                    output_vcf.close()
3593                    log.debug(f"Write done.")
3594
3595                    # Update variants
3596                    log.info(f"Annotations 'bigwig' update...")
3597                    self.update_from_vcf(output_vcf_file)
3598                    log.debug(f"Update done.")
3599
3600        return True

The function annotation_bigwig annotates variants in a VCF file using bigwig databases.

Parameters
  • threads: The threads parameter in the annotation_bigwig method specifies the number of threads to use for parallel processing during annotation. If it is not provided, the method determines the number of threads from the system configuration.
Returns

True

def annotation_snpsift(self, threads: int = None) -> None:
3602    def annotation_snpsift(self, threads: int = None) -> None:
3603        """
3604        This function annotate with bcftools
3605
3606        :param threads: Number of threads to use
3607        :return: the value of the variable "return_value".
3608        """
3609
3610        # DEBUG
3611        log.debug("Start annotation with bcftools databases")
3612
3613        # Threads
3614        if not threads:
3615            threads = self.get_threads()
3616        log.debug("Threads: " + str(threads))
3617
3618        # Config
3619        config = self.get_config()
3620        log.debug("Config: " + str(config))
3621
3622        # Config - snpSift
3623        snpsift_bin_command = get_bin_command(
3624            bin="SnpSift.jar",
3625            tool="snpsift",
3626            bin_type="jar",
3627            config=config,
3628            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3629        )
3630        if not snpsift_bin_command:
3631            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3632            log.error(msg_err)
3633            raise ValueError(msg_err)
3634
3635        # Config - bcftools
3636        bcftools_bin_command = get_bin_command(
3637            bin="bcftools",
3638            tool="bcftools",
3639            bin_type="bin",
3640            config=config,
3641            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3642        )
3643        if not bcftools_bin_command:
3644            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3645            log.error(msg_err)
3646            raise ValueError(msg_err)
3647
3648        # Config - BCFTools databases folders
3649        databases_folders = set(
3650            self.get_config()
3651            .get("folders", {})
3652            .get("databases", {})
3653            .get("annotations", ["."])
3654            + self.get_config()
3655            .get("folders", {})
3656            .get("databases", {})
3657            .get("bcftools", ["."])
3658        )
3659        log.debug("Databases annotations: " + str(databases_folders))
3660
3661        # Param
3662        annotations = (
3663            self.get_param()
3664            .get("annotation", {})
3665            .get("snpsift", {})
3666            .get("annotations", None)
3667        )
3668        log.debug("Annotations: " + str(annotations))
3669
3670        # Assembly
3671        assembly = self.get_param().get(
3672            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3673        )
3674
3675        # Data
3676        table_variants = self.get_table_variants()
3677
3678        # Check if not empty
3679        log.debug("Check if not empty")
3680        sql_query_chromosomes = (
3681            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3682        )
3683        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3684        if not sql_query_chromosomes_df["count"][0]:
3685            log.info(f"VCF empty")
3686            return
3687
3688        # VCF header
3689        vcf_reader = self.get_header()
3690        log.debug("Initial header: " + str(vcf_reader.infos))
3691
3692        # Existing annotations
3693        for vcf_annotation in self.get_header().infos:
3694
3695            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3696            log.debug(
3697                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3698            )
3699
3700        if annotations:
3701
3702            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3703
3704                # Export VCF file
3705                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3706
3707                # Init
3708                commands = {}
3709
3710                for annotation in annotations:
3711                    annotation_fields = annotations[annotation]
3712
3713                    # Annotation Name
3714                    annotation_name = os.path.basename(annotation)
3715
3716                    if not annotation_fields:
3717                        annotation_fields = {"INFO": None}
3718
3719                    log.debug(f"Annotation '{annotation_name}'")
3720                    log.debug(
3721                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3722                    )
3723
3724                    # Create Database
3725                    database = Database(
3726                        database=annotation,
3727                        databases_folders=databases_folders,
3728                        assembly=assembly,
3729                    )
3730
3731                    # Find files
3732                    db_file = database.get_database()
3733                    db_file = full_path(db_file)
3734                    db_hdr_file = database.get_header_file()
3735                    db_hdr_file = full_path(db_hdr_file)
3736                    db_file_type = database.get_format()
3737                    db_tbi_file = f"{db_file}.tbi"
3738                    db_file_compressed = database.is_compressed()
3739
3740                    # Check if compressed
3741                    if not db_file_compressed:
3742                        log.error(
3743                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3744                        )
3745                        raise ValueError(
3746                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3747                        )
3748
3749                    # Check if indexed
3750                    if not os.path.exists(db_tbi_file):
3751                        log.error(
3752                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3753                        )
3754                        raise ValueError(
3755                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3756                        )
3757
3758                    # Check index - try to create if not exists
3759                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3760                        log.error("Annotation failed: database not valid")
3761                        log.error(f"Annotation annotation file: {db_file}")
3762                        log.error(f"Annotation annotation header: {db_hdr_file}")
3763                        log.error(f"Annotation annotation index: {db_tbi_file}")
3764                        raise ValueError(
3765                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3766                        )
3767                    else:
3768
3769                        log.debug(
3770                            f"Annotation '{annotation}' - file: "
3771                            + str(db_file)
3772                            + " and "
3773                            + str(db_hdr_file)
3774                        )
3775
3776                        # Load header as VCF object
3777                        db_hdr_vcf = Variants(input=db_hdr_file)
3778                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3779                        log.debug(
3780                            "Annotation database header: "
3781                            + str(db_hdr_vcf_header_infos)
3782                        )
3783
3784                        # For all fields in database
3785                        annotation_fields_full = False
3786                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3787                            annotation_fields = {
3788                                key: key for key in db_hdr_vcf_header_infos
3789                            }
3790                            log.debug(
3791                                "Annotation database header - All annotations added: "
3792                                + str(annotation_fields)
3793                            )
3794                            annotation_fields_full = True
3795
3796                        # # Create file for field rename
3797                        # log.debug("Create file for field rename")
3798                        # tmp_rename = NamedTemporaryFile(
3799                        #     prefix=self.get_prefix(),
3800                        #     dir=self.get_tmp_dir(),
3801                        #     suffix=".rename",
3802                        #     delete=False,
3803                        # )
3804                        # tmp_rename_name = tmp_rename.name
3805                        # tmp_files.append(tmp_rename_name)
3806
3807                        # Number of fields
3808                        nb_annotation_field = 0
3809                        annotation_list = []
3810                        annotation_infos_rename_list = []
3811
3812                        for annotation_field in annotation_fields:
3813
3814                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3815                            annotation_fields_new_name = annotation_fields.get(
3816                                annotation_field, annotation_field
3817                            )
3818                            if not annotation_fields_new_name:
3819                                annotation_fields_new_name = annotation_field
3820
3821                            # Check if field is in DB and if field is not elready in input data
3822                            if (
3823                                annotation_field in db_hdr_vcf.get_header().infos
3824                                and annotation_fields_new_name
3825                                not in self.get_header().infos
3826                            ):
3827
3828                                log.info(
3829                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3830                                )
3831
3832                                # BCFTools annotate param to rename fields
3833                                if annotation_field != annotation_fields_new_name:
3834                                    annotation_infos_rename_list.append(
3835                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3836                                    )
3837
3838                                # Add INFO field to header
3839                                db_hdr_vcf_header_infos_number = (
3840                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3841                                )
3842                                db_hdr_vcf_header_infos_type = (
3843                                    db_hdr_vcf_header_infos[annotation_field].type
3844                                    or "String"
3845                                )
3846                                db_hdr_vcf_header_infos_description = (
3847                                    db_hdr_vcf_header_infos[annotation_field].desc
3848                                    or f"{annotation_field} description"
3849                                )
3850                                db_hdr_vcf_header_infos_source = (
3851                                    db_hdr_vcf_header_infos[annotation_field].source
3852                                    or "unknown"
3853                                )
3854                                db_hdr_vcf_header_infos_version = (
3855                                    db_hdr_vcf_header_infos[annotation_field].version
3856                                    or "unknown"
3857                                )
3858
3859                                vcf_reader.infos[annotation_fields_new_name] = (
3860                                    vcf.parser._Info(
3861                                        annotation_fields_new_name,
3862                                        db_hdr_vcf_header_infos_number,
3863                                        db_hdr_vcf_header_infos_type,
3864                                        db_hdr_vcf_header_infos_description,
3865                                        db_hdr_vcf_header_infos_source,
3866                                        db_hdr_vcf_header_infos_version,
3867                                        self.code_type_map[
3868                                            db_hdr_vcf_header_infos_type
3869                                        ],
3870                                    )
3871                                )
3872
3873                                annotation_list.append(annotation_field)
3874
3875                                nb_annotation_field += 1
3876
3877                            else:
3878
3879                                if (
3880                                    annotation_field
3881                                    not in db_hdr_vcf.get_header().infos
3882                                ):
3883                                    log.warning(
3884                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3885                                    )
3886                                if (
3887                                    annotation_fields_new_name
3888                                    in self.get_header().infos
3889                                ):
3890                                    log.warning(
3891                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3892                                    )
3893
3894                        log.info(
3895                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3896                        )
3897
3898                        annotation_infos = ",".join(annotation_list)
3899
3900                        if annotation_infos != "":
3901
3902                            # Annotated VCF (and error file)
3903                            tmp_annotation_vcf_name = os.path.join(
3904                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3905                            )
3906                            tmp_annotation_vcf_name_err = (
3907                                tmp_annotation_vcf_name + ".err"
3908                            )
3909
3910                            # Add fields to annotate
3911                            if not annotation_fields_full:
3912                                annotation_infos_option = f"-info {annotation_infos}"
3913                            else:
3914                                annotation_infos_option = ""
3915
3916                            # Info fields rename
3917                            if annotation_infos_rename_list:
3918                                annotation_infos_rename = " -c " + ",".join(
3919                                    annotation_infos_rename_list
3920                                )
3921                            else:
3922                                annotation_infos_rename = ""
3923
3924                            # Annotate command
3925                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3926
3927                            # Add command
3928                            commands[command_annotate] = tmp_annotation_vcf_name
3929
3930                if commands:
3931
3932                    # Export VCF file
3933                    self.export_variant_vcf(
3934                        vcf_file=tmp_vcf_name,
3935                        remove_info=True,
3936                        add_samples=False,
3937                        index=True,
3938                    )
3939                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3940
3941                    # Num command
3942                    nb_command = 0
3943
3944                    # Annotate
3945                    for command_annotate in commands:
3946                        nb_command += 1
3947                        log.info(
3948                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3949                        )
3950                        log.debug(f"command_annotate={command_annotate}")
3951                        run_parallel_commands([command_annotate], threads)
3952
3953                        # Debug
3954                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3955
3956                        # Update variants
3957                        log.info(
3958                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3959                        )
3960                        self.update_from_vcf(commands[command_annotate])

This function annotates variants with SnpSift, piping the result through bcftools annotate for INFO field renaming and compression.

Parameters
  • threads: Number of threads to use
Returns

None (the variants table is updated in place via update_from_vcf).

def annotation_bcftools(self, threads: int = None) -> None:
3962    def annotation_bcftools(self, threads: int = None) -> None:
3963        """
3964        This function annotate with bcftools
3965
3966        :param threads: Number of threads to use
3967        :return: the value of the variable "return_value".
3968        """
3969
3970        # DEBUG
3971        log.debug("Start annotation with bcftools databases")
3972
3973        # Threads
3974        if not threads:
3975            threads = self.get_threads()
3976        log.debug("Threads: " + str(threads))
3977
3978        # Config
3979        config = self.get_config()
3980        log.debug("Config: " + str(config))
3981
3982        # DEBUG
3983        delete_tmp = True
3984        if self.get_config().get("verbosity", "warning") in ["debug"]:
3985            delete_tmp = False
3986            log.debug("Delete tmp files/folders: " + str(delete_tmp))
3987
3988        # Config - BCFTools bin command
3989        bcftools_bin_command = get_bin_command(
3990            bin="bcftools",
3991            tool="bcftools",
3992            bin_type="bin",
3993            config=config,
3994            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3995        )
3996        if not bcftools_bin_command:
3997            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3998            log.error(msg_err)
3999            raise ValueError(msg_err)
4000
4001        # Config - BCFTools databases folders
4002        databases_folders = set(
4003            self.get_config()
4004            .get("folders", {})
4005            .get("databases", {})
4006            .get("annotations", ["."])
4007            + self.get_config()
4008            .get("folders", {})
4009            .get("databases", {})
4010            .get("bcftools", ["."])
4011        )
4012        log.debug("Databases annotations: " + str(databases_folders))
4013
4014        # Param
4015        annotations = (
4016            self.get_param()
4017            .get("annotation", {})
4018            .get("bcftools", {})
4019            .get("annotations", None)
4020        )
4021        log.debug("Annotations: " + str(annotations))
4022
4023        # Assembly
4024        assembly = self.get_param().get(
4025            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
4026        )
4027
4028        # Data
4029        table_variants = self.get_table_variants()
4030
4031        # Check if not empty
4032        log.debug("Check if not empty")
4033        sql_query_chromosomes = (
4034            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4035        )
4036        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
4037        if not sql_query_chromosomes_df["count"][0]:
4038            log.info(f"VCF empty")
4039            return
4040
4041        # Export in VCF
4042        log.debug("Create initial file to annotate")
4043        tmp_vcf = NamedTemporaryFile(
4044            prefix=self.get_prefix(),
4045            dir=self.get_tmp_dir(),
4046            suffix=".vcf.gz",
4047            delete=False,
4048        )
4049        tmp_vcf_name = tmp_vcf.name
4050
4051        # VCF header
4052        vcf_reader = self.get_header()
4053        log.debug("Initial header: " + str(vcf_reader.infos))
4054
4055        # Existing annotations
4056        for vcf_annotation in self.get_header().infos:
4057
4058            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4059            log.debug(
4060                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4061            )
4062
4063        if annotations:
4064
4065            tmp_ann_vcf_list = []
4066            commands = []
4067            tmp_files = []
4068            err_files = []
4069
4070            for annotation in annotations:
4071                annotation_fields = annotations[annotation]
4072
4073                # Annotation Name
4074                annotation_name = os.path.basename(annotation)
4075
4076                if not annotation_fields:
4077                    annotation_fields = {"INFO": None}
4078
4079                log.debug(f"Annotation '{annotation_name}'")
4080                log.debug(
4081                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
4082                )
4083
4084                # Create Database
4085                database = Database(
4086                    database=annotation,
4087                    databases_folders=databases_folders,
4088                    assembly=assembly,
4089                )
4090
4091                # Find files
4092                db_file = database.get_database()
4093                db_file = full_path(db_file)
4094                db_hdr_file = database.get_header_file()
4095                db_hdr_file = full_path(db_hdr_file)
4096                db_file_type = database.get_format()
4097                db_tbi_file = f"{db_file}.tbi"
4098                db_file_compressed = database.is_compressed()
4099
4100                # Check if compressed
4101                if not db_file_compressed:
4102                    log.error(
4103                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
4104                    )
4105                    raise ValueError(
4106                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
4107                    )
4108
4109                # Check if indexed
4110                if not os.path.exists(db_tbi_file):
4111                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
4112                    raise ValueError(
4113                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
4114                    )
4115
4116                # Check index - try to create if not exists
4117                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
4118                    log.error("Annotation failed: database not valid")
4119                    log.error(f"Annotation annotation file: {db_file}")
4120                    log.error(f"Annotation annotation header: {db_hdr_file}")
4121                    log.error(f"Annotation annotation index: {db_tbi_file}")
4122                    raise ValueError(
4123                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
4124                    )
4125                else:
4126
4127                    log.debug(
4128                        f"Annotation '{annotation}' - file: "
4129                        + str(db_file)
4130                        + " and "
4131                        + str(db_hdr_file)
4132                    )
4133
4134                    # Load header as VCF object
4135                    db_hdr_vcf = Variants(input=db_hdr_file)
4136                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
4137                    log.debug(
4138                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
4139                    )
4140
4141                    # For all fields in database
4142                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
4143                        annotation_fields = {
4144                            key: key for key in db_hdr_vcf_header_infos
4145                        }
4146                        log.debug(
4147                            "Annotation database header - All annotations added: "
4148                            + str(annotation_fields)
4149                        )
4150
4151                    # Number of fields
4152                    nb_annotation_field = 0
4153                    annotation_list = []
4154
4155                    for annotation_field in annotation_fields:
4156
4157                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
4158                        annotation_fields_new_name = annotation_fields.get(
4159                            annotation_field, annotation_field
4160                        )
4161                        if not annotation_fields_new_name:
4162                            annotation_fields_new_name = annotation_field
4163
4164                        # Check if field is in DB and if field is not elready in input data
4165                        if (
4166                            annotation_field in db_hdr_vcf.get_header().infos
4167                            and annotation_fields_new_name
4168                            not in self.get_header().infos
4169                        ):
4170
4171                            log.info(
4172                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
4173                            )
4174
4175                            # Add INFO field to header
4176                            db_hdr_vcf_header_infos_number = (
4177                                db_hdr_vcf_header_infos[annotation_field].num or "."
4178                            )
4179                            db_hdr_vcf_header_infos_type = (
4180                                db_hdr_vcf_header_infos[annotation_field].type
4181                                or "String"
4182                            )
4183                            db_hdr_vcf_header_infos_description = (
4184                                db_hdr_vcf_header_infos[annotation_field].desc
4185                                or f"{annotation_field} description"
4186                            )
4187                            db_hdr_vcf_header_infos_source = (
4188                                db_hdr_vcf_header_infos[annotation_field].source
4189                                or "unknown"
4190                            )
4191                            db_hdr_vcf_header_infos_version = (
4192                                db_hdr_vcf_header_infos[annotation_field].version
4193                                or "unknown"
4194                            )
4195
4196                            vcf_reader.infos[annotation_fields_new_name] = (
4197                                vcf.parser._Info(
4198                                    annotation_fields_new_name,
4199                                    db_hdr_vcf_header_infos_number,
4200                                    db_hdr_vcf_header_infos_type,
4201                                    db_hdr_vcf_header_infos_description,
4202                                    db_hdr_vcf_header_infos_source,
4203                                    db_hdr_vcf_header_infos_version,
4204                                    self.code_type_map[db_hdr_vcf_header_infos_type],
4205                                )
4206                            )
4207
4208                            # annotation_list.append(annotation_field)
4209                            if annotation_field != annotation_fields_new_name:
4210                                annotation_list.append(
4211                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
4212                                )
4213                            else:
4214                                annotation_list.append(annotation_field)
4215
4216                            nb_annotation_field += 1
4217
4218                        else:
4219
4220                            if annotation_field not in db_hdr_vcf.get_header().infos:
4221                                log.warning(
4222                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
4223                                )
4224                            if annotation_fields_new_name in self.get_header().infos:
4225                                log.warning(
4226                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
4227                                )
4228
4229                    log.info(
4230                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
4231                    )
4232
4233                    annotation_infos = ",".join(annotation_list)
4234
4235                    if annotation_infos != "":
4236
4237                        # Protect header for bcftools (remove "#CHROM" and variants line)
4238                        log.debug("Protect Header file - remove #CHROM line if exists")
4239                        tmp_header_vcf = NamedTemporaryFile(
4240                            prefix=self.get_prefix(),
4241                            dir=self.get_tmp_dir(),
4242                            suffix=".hdr",
4243                            delete=False,
4244                        )
4245                        tmp_header_vcf_name = tmp_header_vcf.name
4246                        tmp_files.append(tmp_header_vcf_name)
4247                        # Command
4248                        if db_hdr_file.endswith(".gz"):
4249                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
4250                        else:
4251                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
4252                        # Run
4253                        run_parallel_commands([command_extract_header], 1)
4254
4255                        # Find chomosomes
4256                        log.debug("Find chromosomes ")
4257                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
4258                        sql_query_chromosomes_df = self.get_query_to_df(
4259                            sql_query_chromosomes
4260                        )
4261                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
4262
4263                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
4264
4265                        # BED columns in the annotation file
4266                        if db_file_type in ["bed"]:
4267                            annotation_infos = "CHROM,POS,POS," + annotation_infos
4268
4269                        for chrom in chomosomes_list:
4270
4271                            # Create BED on initial VCF
4272                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
4273                            tmp_bed = NamedTemporaryFile(
4274                                prefix=self.get_prefix(),
4275                                dir=self.get_tmp_dir(),
4276                                suffix=".bed",
4277                                delete=False,
4278                            )
4279                            tmp_bed_name = tmp_bed.name
4280                            tmp_files.append(tmp_bed_name)
4281
4282                            # Detecte regions
4283                            log.debug(
4284                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
4285                            )
4286                            window = 1000000
4287                            sql_query_intervals_for_bed = f"""
4288                                SELECT  \"#CHROM\",
4289                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
4290                                        \"POS\"+{window}
4291                                FROM {table_variants} as table_variants
4292                                WHERE table_variants.\"#CHROM\" = '{chrom}'
4293                            """
4294                            regions = self.conn.execute(
4295                                sql_query_intervals_for_bed
4296                            ).fetchall()
4297                            merged_regions = merge_regions(regions)
4298                            log.debug(
4299                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
4300                            )
4301
4302                            header = ["#CHROM", "START", "END"]
4303                            with open(tmp_bed_name, "w") as f:
4304                                # Write the header with tab delimiter
4305                                f.write("\t".join(header) + "\n")
4306                                for d in merged_regions:
4307                                    # Write each data row with tab delimiter
4308                                    f.write("\t".join(map(str, d)) + "\n")
4309
4310                            # Tmp files
4311                            tmp_annotation_vcf = NamedTemporaryFile(
4312                                prefix=self.get_prefix(),
4313                                dir=self.get_tmp_dir(),
4314                                suffix=".vcf.gz",
4315                                delete=False,
4316                            )
4317                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
4318                            tmp_files.append(tmp_annotation_vcf_name)
4319                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
4320                            tmp_annotation_vcf_name_err = (
4321                                tmp_annotation_vcf_name + ".err"
4322                            )
4323                            err_files.append(tmp_annotation_vcf_name_err)
4324
4325                            # Annotate Command
4326                            log.debug(
4327                                f"Annotation '{annotation}' - add bcftools command"
4328                            )
4329
4330                            # Command
4331                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
4332
4333                            # Add command
4334                            commands.append(command_annotate)
4335
4336            # if some commands
4337            if commands:
4338
4339                # Export VCF file
4340                self.export_variant_vcf(
4341                    vcf_file=tmp_vcf_name,
4342                    remove_info=True,
4343                    add_samples=False,
4344                    index=True,
4345                )
4346
4347                # Threads
4348                # calculate threads for annotated commands
4349                if commands:
4350                    threads_bcftools_annotate = round(threads / len(commands))
4351                else:
4352                    threads_bcftools_annotate = 1
4353
4354                if not threads_bcftools_annotate:
4355                    threads_bcftools_annotate = 1
4356
4357                # Add threads option to bcftools commands
4358                if threads_bcftools_annotate > 1:
4359                    commands_threaded = []
4360                    for command in commands:
4361                        commands_threaded.append(
4362                            command.replace(
4363                                f"{bcftools_bin_command} annotate ",
4364                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
4365                            )
4366                        )
4367                    commands = commands_threaded
4368
4369                # Command annotation multithreading
4370                log.debug(f"Annotation - Annotation commands: " + str(commands))
4371                log.info(
4372                    f"Annotation - Annotation multithreaded in "
4373                    + str(len(commands))
4374                    + " commands"
4375                )
4376
4377                run_parallel_commands(commands, threads)
4378
4379                # Merge
4380                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
4381
4382                if tmp_ann_vcf_list_cmd:
4383
4384                    # Tmp file
4385                    tmp_annotate_vcf = NamedTemporaryFile(
4386                        prefix=self.get_prefix(),
4387                        dir=self.get_tmp_dir(),
4388                        suffix=".vcf.gz",
4389                        delete=True,
4390                    )
4391                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
4392                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4393                    err_files.append(tmp_annotate_vcf_name_err)
4394
4395                    # Tmp file remove command
4396                    tmp_files_remove_command = ""
4397                    if tmp_files:
4398                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
4399
4400                    # Command merge
4401                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
4402                    log.info(
4403                        f"Annotation - Annotation merging "
4404                        + str(len(commands))
4405                        + " annotated files"
4406                    )
4407                    log.debug(f"Annotation - merge command: {merge_command}")
4408                    run_parallel_commands([merge_command], 1)
4409
4410                    # Error messages
4411                    log.info(f"Error/Warning messages:")
4412                    error_message_command_all = []
4413                    error_message_command_warning = []
4414                    error_message_command_err = []
4415                    for err_file in err_files:
4416                        with open(err_file, "r") as f:
4417                            for line in f:
4418                                message = line.strip()
4419                                error_message_command_all.append(message)
4420                                if line.startswith("[W::"):
4421                                    error_message_command_warning.append(message)
4422                                if line.startswith("[E::"):
4423                                    error_message_command_err.append(
4424                                        f"{err_file}: " + message
4425                                    )
4426                    # log info
4427                    for message in list(
4428                        set(error_message_command_err + error_message_command_warning)
4429                    ):
4430                        log.info(f"   {message}")
4431                    # debug info
4432                    for message in list(set(error_message_command_all)):
4433                        log.debug(f"   {message}")
4434                    # failed
4435                    if len(error_message_command_err):
4436                        log.error("Annotation failed: Error in commands")
4437                        raise ValueError("Annotation failed: Error in commands")
4438
4439                    # Update variants
4440                    log.info(f"Annotation - Updating...")
4441                    self.update_from_vcf(tmp_annotate_vcf_name)

This function annotates with bcftools.

Parameters
  • threads: Number of threads to use

Returns
  The value of the variable "return_value".
4443    def annotation_exomiser(self, threads: int = None) -> None:
4444        """
4445        This function annotate with Exomiser
4446
4447        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
4448        - "analysis" (dict/file):
4449            Full analysis dictionnary parameters (see Exomiser docs).
4450            Either a dict, or a file in JSON or YAML format.
4451            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
4452            Default : None
4453        - "preset" (string):
4454            Analysis preset (available in config folder).
4455            Used if no full "analysis" is provided.
4456            Default: "exome"
4457        - "phenopacket" (dict/file):
4458            Samples and phenotipic features parameters (see Exomiser docs).
4459            Either a dict, or a file in JSON or YAML format.
4460            Default: None
4461        - "subject" (dict):
4462            Sample parameters (see Exomiser docs).
4463            Example:
4464                "subject":
4465                    {
4466                        "id": "ISDBM322017",
4467                        "sex": "FEMALE"
4468                    }
4469            Default: None
4470        - "sample" (string):
4471            Sample name to construct "subject" section:
4472                "subject":
4473                    {
4474                        "id": "<sample>",
4475                        "sex": "UNKNOWN_SEX"
4476                    }
4477            Default: None
4478        - "phenotypicFeatures" (dict)
4479            Phenotypic features to construct "subject" section.
4480            Example:
4481                "phenotypicFeatures":
4482                    [
4483                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
4484                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
4485                    ]
4486        - "hpo" (list)
4487            List of HPO ids as phenotypic features.
4488            Example:
4489                "hpo": ['0001156', '0001363', '0011304', '0010055']
4490            Default: []
4491        - "outputOptions" (dict):
4492            Output options (see Exomiser docs).
4493            Default:
4494                "output_options" =
4495                    {
4496                        "outputContributingVariantsOnly": False,
4497                        "numGenes": 0,
4498                        "outputFormats": ["TSV_VARIANT", "VCF"]
4499                    }
4500        - "transcript_source" (string):
4501            Transcript source (either "refseq", "ucsc", "ensembl")
4502            Default: "refseq"
4503        - "exomiser_to_info" (boolean):
4504            Add exomiser TSV file columns as INFO fields in VCF.
4505            Default: False
4506        - "release" (string):
4507            Exomise database release.
4508            If not exists, database release will be downloaded (take a while).
4509            Default: None (provided by application.properties configuration file)
4510        - "exomiser_application_properties" (file):
4511            Exomiser configuration file (see Exomiser docs).
4512            Useful to automatically download databases (especially for specific genome databases).
4513
4514        Notes:
4515        - If no sample in parameters, first sample in VCF will be chosen
4516        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4517
4518        :param threads: The number of threads to use
4519        :return: None.
4520        """
4521
4522        # DEBUG
4523        log.debug("Start annotation with Exomiser databases")
4524
4525        # Threads
4526        if not threads:
4527            threads = self.get_threads()
4528        log.debug("Threads: " + str(threads))
4529
4530        # Config
4531        config = self.get_config()
4532        log.debug("Config: " + str(config))
4533
4534        # Config - Folders - Databases
4535        databases_folders = (
4536            config.get("folders", {})
4537            .get("databases", {})
4538            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4539        )
4540        databases_folders = full_path(databases_folders)
4541        if not os.path.exists(databases_folders):
4542            log.error(f"Databases annotations: {databases_folders} NOT found")
4543        log.debug("Databases annotations: " + str(databases_folders))
4544
4545        # Config - Exomiser
4546        exomiser_bin_command = get_bin_command(
4547            bin="exomiser-cli*.jar",
4548            tool="exomiser",
4549            bin_type="jar",
4550            config=config,
4551            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4552        )
4553        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4554        if not exomiser_bin_command:
4555            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4556            log.error(msg_err)
4557            raise ValueError(msg_err)
4558
4559        # Param
4560        param = self.get_param()
4561        log.debug("Param: " + str(param))
4562
4563        # Param - Exomiser
4564        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4565        log.debug(f"Param Exomiser: {param_exomiser}")
4566
4567        # Param - Assembly
4568        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4569        log.debug("Assembly: " + str(assembly))
4570
4571        # Data
4572        table_variants = self.get_table_variants()
4573
4574        # Check if not empty
4575        log.debug("Check if not empty")
4576        sql_query_chromosomes = (
4577            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4578        )
4579        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4580            log.info(f"VCF empty")
4581            return False
4582
4583        # VCF header
4584        vcf_reader = self.get_header()
4585        log.debug("Initial header: " + str(vcf_reader.infos))
4586
4587        # Samples
4588        samples = self.get_header_sample_list()
4589        if not samples:
4590            log.error("No Samples in VCF")
4591            return False
4592        log.debug(f"Samples: {samples}")
4593
4594        # Memory limit
4595        memory_limit = self.get_memory("8G")
4596        log.debug(f"memory_limit: {memory_limit}")
4597
4598        # Exomiser java options
4599        exomiser_java_options = (
4600            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4601        )
4602        log.debug(f"Exomiser java options: {exomiser_java_options}")
4603
4604        # Download Exomiser (if not exists)
4605        exomiser_release = param_exomiser.get("release", None)
4606        exomiser_application_properties = param_exomiser.get(
4607            "exomiser_application_properties", None
4608        )
4609        databases_download_exomiser(
4610            assemblies=[assembly],
4611            exomiser_folder=databases_folders,
4612            exomiser_release=exomiser_release,
4613            exomiser_phenotype_release=exomiser_release,
4614            exomiser_application_properties=exomiser_application_properties,
4615        )
4616
4617        # Force annotation
4618        force_update_annotation = True
4619
4620        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4621            log.debug("Start annotation Exomiser")
4622
4623            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4624
4625                # tmp_dir = "/tmp/exomiser"
4626
4627                ### ANALYSIS ###
4628                ################
4629
4630                # Create analysis.json through analysis dict
4631                # either analysis in param or by default
4632                # depending on preset exome/genome)
4633
4634                # Init analysis dict
4635                param_exomiser_analysis_dict = {}
4636
4637                # analysis from param
4638                param_exomiser_analysis = param_exomiser.get("analysis", {})
4639                param_exomiser_analysis = full_path(param_exomiser_analysis)
4640
4641                # If analysis in param -> load anlaysis json
4642                if param_exomiser_analysis:
4643
4644                    # If param analysis is a file and exists
4645                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4646                        param_exomiser_analysis
4647                    ):
4648                        # Load analysis file into analysis dict (either yaml or json)
4649                        with open(param_exomiser_analysis) as json_file:
4650                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4651
4652                    # If param analysis is a dict
4653                    elif isinstance(param_exomiser_analysis, dict):
4654                        # Load analysis dict into analysis dict (either yaml or json)
4655                        param_exomiser_analysis_dict = param_exomiser_analysis
4656
4657                    # Error analysis type
4658                    else:
4659                        log.error(f"Analysis type unknown. Check param file.")
4660                        raise ValueError(f"Analysis type unknown. Check param file.")
4661
4662                # Case no input analysis config file/dict
4663                # Use preset (exome/genome) to open default config file
4664                if not param_exomiser_analysis_dict:
4665
4666                    # default preset
4667                    default_preset = "exome"
4668
4669                    # Get param preset or default preset
4670                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4671
4672                    # Try to find if preset is a file
4673                    if os.path.exists(param_exomiser_preset):
4674                        # Preset file is provided in full path
4675                        param_exomiser_analysis_default_config_file = (
4676                            param_exomiser_preset
4677                        )
4678                    # elif os.path.exists(full_path(param_exomiser_preset)):
4679                    #     # Preset file is provided in full path
4680                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4681                    elif os.path.exists(
4682                        os.path.join(folder_config, param_exomiser_preset)
4683                    ):
4684                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4685                        param_exomiser_analysis_default_config_file = os.path.join(
4686                            folder_config, param_exomiser_preset
4687                        )
4688                    else:
4689                        # Construct preset file
4690                        param_exomiser_analysis_default_config_file = os.path.join(
4691                            folder_config,
4692                            f"preset-{param_exomiser_preset}-analysis.json",
4693                        )
4694
4695                    # If preset file exists
4696                    param_exomiser_analysis_default_config_file = full_path(
4697                        param_exomiser_analysis_default_config_file
4698                    )
4699                    if os.path.exists(param_exomiser_analysis_default_config_file):
4700                        # Load prest file into analysis dict (either yaml or json)
4701                        with open(
4702                            param_exomiser_analysis_default_config_file
4703                        ) as json_file:
4704                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4705                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4706                                json_file
4707                            )
4708
4709                    # Error preset file
4710                    else:
4711                        log.error(
4712                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4713                        )
4714                        raise ValueError(
4715                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4716                        )
4717
4718                # If no analysis dict created
4719                if not param_exomiser_analysis_dict:
4720                    log.error(f"No analysis config")
4721                    raise ValueError(f"No analysis config")
4722
4723                # Log
4724                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4725
4726                ### PHENOPACKET ###
4727                ###################
4728
4729                # If no PhenoPacket in analysis dict -> check in param
4730                if "phenopacket" not in param_exomiser_analysis_dict:
4731
4732                    # If PhenoPacket in param -> load anlaysis json
4733                    if param_exomiser.get("phenopacket", None):
4734
4735                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4736                        param_exomiser_phenopacket = full_path(
4737                            param_exomiser_phenopacket
4738                        )
4739
4740                        # If param phenopacket is a file and exists
4741                        if isinstance(
4742                            param_exomiser_phenopacket, str
4743                        ) and os.path.exists(param_exomiser_phenopacket):
4744                            # Load phenopacket file into analysis dict (either yaml or json)
4745                            with open(param_exomiser_phenopacket) as json_file:
4746                                param_exomiser_analysis_dict["phenopacket"] = (
4747                                    yaml.safe_load(json_file)
4748                                )
4749
4750                        # If param phenopacket is a dict
4751                        elif isinstance(param_exomiser_phenopacket, dict):
4752                            # Load phenopacket dict into analysis dict (either yaml or json)
4753                            param_exomiser_analysis_dict["phenopacket"] = (
4754                                param_exomiser_phenopacket
4755                            )
4756
4757                        # Error phenopacket type
4758                        else:
4759                            log.error(f"Phenopacket type unknown. Check param file.")
4760                            raise ValueError(
4761                                f"Phenopacket type unknown. Check param file."
4762                            )
4763
4764                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4765                if "phenopacket" not in param_exomiser_analysis_dict:
4766
4767                    # Init PhenoPacket
4768                    param_exomiser_analysis_dict["phenopacket"] = {
4769                        "id": "analysis",
4770                        "proband": {},
4771                    }
4772
4773                    ### Add subject ###
4774
4775                    # If subject exists
4776                    param_exomiser_subject = param_exomiser.get("subject", {})
4777
4778                    # If subject not exists -> found sample ID
4779                    if not param_exomiser_subject:
4780
4781                        # Found sample ID in param
4782                        sample = param_exomiser.get("sample", None)
4783
4784                        # Find sample ID (first sample)
4785                        if not sample:
4786                            sample_list = self.get_header_sample_list()
4787                            if len(sample_list) > 0:
4788                                sample = sample_list[0]
4789                            else:
4790                                log.error(f"No sample found")
4791                                raise ValueError(f"No sample found")
4792
4793                        # Create subject
4794                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4795
4796                    # Add to dict
4797                    param_exomiser_analysis_dict["phenopacket"][
4798                        "subject"
4799                    ] = param_exomiser_subject
4800
4801                    ### Add "phenotypicFeatures" ###
4802
4803                    # If phenotypicFeatures exists
4804                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4805                        "phenotypicFeatures", []
4806                    )
4807
4808                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4809                    if not param_exomiser_phenotypicfeatures:
4810
4811                        # Found HPO in param
4812                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4813
4814                        # Split HPO if list in string format separated by comma
4815                        if isinstance(param_exomiser_hpo, str):
4816                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4817
4818                        # Create HPO list
4819                        for hpo in param_exomiser_hpo:
4820                            hpo_clean = re.sub("[^0-9]", "", hpo)
4821                            param_exomiser_phenotypicfeatures.append(
4822                                {
4823                                    "type": {
4824                                        "id": f"HP:{hpo_clean}",
4825                                        "label": f"HP:{hpo_clean}",
4826                                    }
4827                                }
4828                            )
4829
4830                    # Add to dict
4831                    param_exomiser_analysis_dict["phenopacket"][
4832                        "phenotypicFeatures"
4833                    ] = param_exomiser_phenotypicfeatures
4834
4835                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4836                    if not param_exomiser_phenotypicfeatures:
4837                        for step in param_exomiser_analysis_dict.get(
4838                            "analysis", {}
4839                        ).get("steps", []):
4840                            if "hiPhivePrioritiser" in step:
4841                                param_exomiser_analysis_dict.get("analysis", {}).get(
4842                                    "steps", []
4843                                ).remove(step)
4844
4845                ### Add Input File ###
4846
4847                # Initial file name and htsFiles
4848                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4849                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4850                    {
4851                        "uri": tmp_vcf_name,
4852                        "htsFormat": "VCF",
4853                        "genomeAssembly": assembly,
4854                    }
4855                ]
4856
4857                ### Add metaData ###
4858
4859                # If metaData not in analysis dict
4860                if "metaData" not in param_exomiser_analysis_dict:
4861                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4862                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4863                        "createdBy": "howard",
4864                        "phenopacketSchemaVersion": 1,
4865                    }
4866
4867                ### OutputOptions ###
4868
4869                # Init output result folder
4870                output_results = os.path.join(tmp_dir, "results")
4871
4872                # If no outputOptions in analysis dict
4873                if "outputOptions" not in param_exomiser_analysis_dict:
4874
4875                    # default output formats
4876                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4877
4878                    # Get outputOptions in param
4879                    output_options = param_exomiser.get("outputOptions", None)
4880
4881                    # If no output_options in param -> check
4882                    if not output_options:
4883                        output_options = {
4884                            "outputContributingVariantsOnly": False,
4885                            "numGenes": 0,
4886                            "outputFormats": defaut_output_formats,
4887                        }
4888
4889                    # Replace outputDirectory in output options
4890                    output_options["outputDirectory"] = output_results
4891                    output_options["outputFileName"] = "howard"
4892
4893                    # Add outputOptions in analysis dict
4894                    param_exomiser_analysis_dict["outputOptions"] = output_options
4895
4896                else:
4897
4898                    # Replace output_results and output format (if exists in param)
4899                    param_exomiser_analysis_dict["outputOptions"][
4900                        "outputDirectory"
4901                    ] = output_results
4902                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4903                        list(
4904                            set(
4905                                param_exomiser_analysis_dict.get(
4906                                    "outputOptions", {}
4907                                ).get("outputFormats", [])
4908                                + ["TSV_VARIANT", "VCF"]
4909                            )
4910                        )
4911                    )
4912
4913                # log
4914                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4915
4916                ### ANALYSIS FILE ###
4917                #####################
4918
4919                ### Full JSON analysis config file ###
4920
4921                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4922                with open(exomiser_analysis, "w") as fp:
4923                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4924
4925                ### SPLIT analysis and sample config files
4926
4927                # Splitted analysis dict
4928                param_exomiser_analysis_dict_for_split = (
4929                    param_exomiser_analysis_dict.copy()
4930                )
4931
4932                # Phenopacket JSON file
4933                exomiser_analysis_phenopacket = os.path.join(
4934                    tmp_dir, "analysis_phenopacket.json"
4935                )
4936                with open(exomiser_analysis_phenopacket, "w") as fp:
4937                    json.dump(
4938                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4939                        fp,
4940                        indent=4,
4941                    )
4942
4943                # Analysis JSON file without Phenopacket parameters
4944                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4945                exomiser_analysis_analysis = os.path.join(
4946                    tmp_dir, "analysis_analysis.json"
4947                )
4948                with open(exomiser_analysis_analysis, "w") as fp:
4949                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4950
4951                ### INITAL VCF file ###
4952                #######################
4953
4954                ### Create list of samples to use and include inti initial VCF file ####
4955
4956                # Subject (main sample)
4957                # Get sample ID in analysis dict
4958                sample_subject = (
4959                    param_exomiser_analysis_dict.get("phenopacket", {})
4960                    .get("subject", {})
4961                    .get("id", None)
4962                )
4963                sample_proband = (
4964                    param_exomiser_analysis_dict.get("phenopacket", {})
4965                    .get("proband", {})
4966                    .get("subject", {})
4967                    .get("id", None)
4968                )
4969                sample = []
4970                if sample_subject:
4971                    sample.append(sample_subject)
4972                if sample_proband:
4973                    sample.append(sample_proband)
4974
4975                # Get sample ID within Pedigree
4976                pedigree_persons_list = (
4977                    param_exomiser_analysis_dict.get("phenopacket", {})
4978                    .get("pedigree", {})
4979                    .get("persons", {})
4980                )
4981
4982                # Create list with all sample ID in pedigree (if exists)
4983                pedigree_persons = []
4984                for person in pedigree_persons_list:
4985                    pedigree_persons.append(person.get("individualId"))
4986
4987                # Concat subject sample ID and samples ID in pedigreesamples
4988                samples = list(set(sample + pedigree_persons))
4989
4990                # Check if sample list is not empty
4991                if not samples:
4992                    log.error(f"No samples found")
4993                    raise ValueError(f"No samples found")
4994
4995                # Create VCF with sample (either sample in param or first one by default)
4996                # Export VCF file
4997                self.export_variant_vcf(
4998                    vcf_file=tmp_vcf_name,
4999                    remove_info=True,
5000                    add_samples=True,
5001                    list_samples=samples,
5002                    index=False,
5003                )
5004
5005                ### Execute Exomiser ###
5006                ########################
5007
5008                # Init command
5009                exomiser_command = ""
5010
5011                # Command exomiser options
5012                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
5013
5014                # Release
5015                exomiser_release = param_exomiser.get("release", None)
5016                if exomiser_release:
5017                    # phenotype data version
5018                    exomiser_options += (
5019                        f" --exomiser.phenotype.data-version={exomiser_release} "
5020                    )
5021                    # data version
5022                    exomiser_options += (
5023                        f" --exomiser.{assembly}.data-version={exomiser_release} "
5024                    )
5025                    # variant white list
5026                    variant_white_list_file = (
5027                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
5028                    )
5029                    if os.path.exists(
5030                        os.path.join(
5031                            databases_folders, assembly, variant_white_list_file
5032                        )
5033                    ):
5034                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
5035
5036                # transcript_source
5037                transcript_source = param_exomiser.get(
5038                    "transcript_source", None
5039                )  # ucsc, refseq, ensembl
5040                if transcript_source:
5041                    exomiser_options += (
5042                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
5043                    )
5044
5045                # If analysis contain proband param
5046                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
5047                    "proband", {}
5048                ):
5049                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
5050
5051                # If no proband (usually uniq sample)
5052                else:
5053                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
5054
5055                # Log
5056                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
5057
5058                # Run command
5059                result = subprocess.call(
5060                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
5061                )
5062                if result:
5063                    log.error("Exomiser command failed")
5064                    raise ValueError("Exomiser command failed")
5065
5066                ### RESULTS ###
5067                ###############
5068
5069                ### Annotate with TSV fields ###
5070
5071                # Init result tsv file
5072                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
5073
5074                # Init result tsv file
5075                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
5076
5077                # Parse TSV file and explode columns in INFO field
5078                if exomiser_to_info and os.path.exists(output_results_tsv):
5079
5080                    # Log
5081                    log.debug("Exomiser columns to VCF INFO field")
5082
5083                    # Retrieve columns and types
5084                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
5085                    output_results_tsv_df = self.get_query_to_df(query)
5086                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
5087
5088                    # Init concat fields for update
5089                    sql_query_update_concat_fields = []
5090
5091                    # Fields to avoid
5092                    fields_to_avoid = [
5093                        "CONTIG",
5094                        "START",
5095                        "END",
5096                        "REF",
5097                        "ALT",
5098                        "QUAL",
5099                        "FILTER",
5100                        "GENOTYPE",
5101                    ]
5102
5103                    # List all columns to add into header
5104                    for header_column in output_results_tsv_columns:
5105
5106                        # If header column is enable
5107                        if header_column not in fields_to_avoid:
5108
5109                            # Header info type
5110                            header_info_type = "String"
5111                            header_column_df = output_results_tsv_df[header_column]
5112                            header_column_df_dtype = header_column_df.dtype
5113                            if header_column_df_dtype == object:
5114                                if (
5115                                    pd.to_numeric(header_column_df, errors="coerce")
5116                                    .notnull()
5117                                    .all()
5118                                ):
5119                                    header_info_type = "Float"
5120                            else:
5121                                header_info_type = "Integer"
5122
5123                            # Header info
5124                            characters_to_validate = ["-"]
5125                            pattern = "[" + "".join(characters_to_validate) + "]"
5126                            header_info_name = re.sub(
5127                                pattern,
5128                                "_",
5129                                f"Exomiser_{header_column}".replace("#", ""),
5130                            )
5131                            header_info_number = "."
5132                            header_info_description = (
5133                                f"Exomiser {header_column} annotation"
5134                            )
5135                            header_info_source = "Exomiser"
5136                            header_info_version = "unknown"
5137                            header_info_code = CODE_TYPE_MAP[header_info_type]
5138                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
5139                                header_info_name,
5140                                header_info_number,
5141                                header_info_type,
5142                                header_info_description,
5143                                header_info_source,
5144                                header_info_version,
5145                                header_info_code,
5146                            )
5147
5148                            # Add field to add for update to concat fields
5149                            sql_query_update_concat_fields.append(
5150                                f"""
5151                                CASE
5152                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
5153                                    THEN concat(
5154                                        '{header_info_name}=',
5155                                        table_parquet."{header_column}",
5156                                        ';'
5157                                        )
5158
5159                                    ELSE ''
5160                                END
5161                            """
5162                            )
5163
5164                    # Update query
5165                    sql_query_update = f"""
5166                        UPDATE {table_variants} as table_variants
5167                            SET INFO = concat(
5168                                            CASE
5169                                                WHEN INFO NOT IN ('', '.')
5170                                                THEN INFO
5171                                                ELSE ''
5172                                            END,
5173                                            CASE
5174                                                WHEN table_variants.INFO NOT IN ('','.')
5175                                                THEN ';'
5176                                                ELSE ''
5177                                            END,
5178                                            (
5179                                            SELECT 
5180                                                concat(
5181                                                    {",".join(sql_query_update_concat_fields)}
5182                                                )
5183                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
5184                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
5185                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
5186                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5187                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5188                                            )
5189                                        )
5190                            ;
5191                        """
5192
5193                    # Update
5194                    self.conn.execute(sql_query_update)
5195
5196                ### Annotate with VCF INFO field ###
5197
5198                # Init result VCF file
5199                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
5200
5201                # If VCF exists
5202                if os.path.exists(output_results_vcf):
5203
5204                    # Log
5205                    log.debug("Exomiser result VCF update variants")
5206
5207                    # Find Exomiser INFO field annotation in header
5208                    with gzip.open(output_results_vcf, "rt") as f:
5209                        header_list = self.read_vcf_header(f)
5210                    exomiser_vcf_header = vcf.Reader(
5211                        io.StringIO("\n".join(header_list))
5212                    )
5213
5214                    # Add annotation INFO field to header
5215                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
5216
5217                    # Update variants with VCF
5218                    self.update_from_vcf(output_results_vcf)
5219
5220        return True

This function annotates with Exomiser

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO). Default: None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO is found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
5222    def annotation_snpeff(self, threads: int = None) -> None:
5223        """
5224        This function annotate with snpEff
5225
5226        :param threads: The number of threads to use
5227        :return: the value of the variable "return_value".
5228        """
5229
5230        # DEBUG
5231        log.debug("Start annotation with snpeff databases")
5232
5233        # Threads
5234        if not threads:
5235            threads = self.get_threads()
5236        log.debug("Threads: " + str(threads))
5237
5238        # DEBUG
5239        delete_tmp = True
5240        if self.get_config().get("verbosity", "warning") in ["debug"]:
5241            delete_tmp = False
5242            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5243
5244        # Config
5245        config = self.get_config()
5246        log.debug("Config: " + str(config))
5247
5248        # Config - Folders - Databases
5249        databases_folders = (
5250            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
5251        )
5252        log.debug("Databases annotations: " + str(databases_folders))
5253
5254        # Config - snpEff bin command
5255        snpeff_bin_command = get_bin_command(
5256            bin="snpEff.jar",
5257            tool="snpeff",
5258            bin_type="jar",
5259            config=config,
5260            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
5261        )
5262        if not snpeff_bin_command:
5263            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
5264            log.error(msg_err)
5265            raise ValueError(msg_err)
5266
5267        # Config - snpEff databases
5268        snpeff_databases = (
5269            config.get("folders", {})
5270            .get("databases", {})
5271            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
5272        )
5273        snpeff_databases = full_path(snpeff_databases)
5274        if snpeff_databases is not None and snpeff_databases != "":
5275            log.debug(f"Create snpEff databases folder")
5276            if not os.path.exists(snpeff_databases):
5277                os.makedirs(snpeff_databases)
5278
5279        # Param
5280        param = self.get_param()
5281        log.debug("Param: " + str(param))
5282
5283        # Param
5284        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
5285        log.debug("Options: " + str(options))
5286
5287        # Param - Assembly
5288        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5289
5290        # Param - Options
5291        snpeff_options = (
5292            param.get("annotation", {}).get("snpeff", {}).get("options", "")
5293        )
5294        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
5295        snpeff_csvstats = (
5296            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
5297        )
5298        if snpeff_stats:
5299            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
5300            snpeff_stats = full_path(snpeff_stats)
5301            snpeff_options += f" -stats {snpeff_stats}"
5302        if snpeff_csvstats:
5303            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
5304            snpeff_csvstats = full_path(snpeff_csvstats)
5305            snpeff_options += f" -csvStats {snpeff_csvstats}"
5306
5307        # Data
5308        table_variants = self.get_table_variants()
5309
5310        # Check if not empty
5311        log.debug("Check if not empty")
5312        sql_query_chromosomes = (
5313            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5314        )
5315        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
5316        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5317            log.info(f"VCF empty")
5318            return
5319
5320        # Export in VCF
5321        log.debug("Create initial file to annotate")
5322        tmp_vcf = NamedTemporaryFile(
5323            prefix=self.get_prefix(),
5324            dir=self.get_tmp_dir(),
5325            suffix=".vcf.gz",
5326            delete=True,
5327        )
5328        tmp_vcf_name = tmp_vcf.name
5329
5330        # VCF header
5331        vcf_reader = self.get_header()
5332        log.debug("Initial header: " + str(vcf_reader.infos))
5333
5334        # Existing annotations
5335        for vcf_annotation in self.get_header().infos:
5336
5337            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5338            log.debug(
5339                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5340            )
5341
5342        # Memory limit
5343        # if config.get("memory", None):
5344        #     memory_limit = config.get("memory", "8G")
5345        # else:
5346        #     memory_limit = "8G"
5347        memory_limit = self.get_memory("8G")
5348        log.debug(f"memory_limit: {memory_limit}")
5349
5350        # snpEff java options
5351        snpeff_java_options = (
5352            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
5353        )
5354        log.debug(f"Exomiser java options: {snpeff_java_options}")
5355
5356        force_update_annotation = True
5357
5358        if "ANN" not in self.get_header().infos or force_update_annotation:
5359
5360            # Check snpEff database
5361            log.debug(f"Check snpEff databases {[assembly]}")
5362            databases_download_snpeff(
5363                folder=snpeff_databases, assemblies=[assembly], config=config
5364            )
5365
5366            # Export VCF file
5367            self.export_variant_vcf(
5368                vcf_file=tmp_vcf_name,
5369                remove_info=True,
5370                add_samples=False,
5371                index=True,
5372            )
5373
5374            # Tmp file
5375            err_files = []
5376            tmp_annotate_vcf = NamedTemporaryFile(
5377                prefix=self.get_prefix(),
5378                dir=self.get_tmp_dir(),
5379                suffix=".vcf",
5380                delete=False,
5381            )
5382            tmp_annotate_vcf_name = tmp_annotate_vcf.name
5383            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5384            err_files.append(tmp_annotate_vcf_name_err)
5385
5386            # Command
5387            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
5388            log.debug(f"Annotation - snpEff command: {snpeff_command}")
5389            run_parallel_commands([snpeff_command], 1)
5390
5391            # Error messages
5392            log.info(f"Error/Warning messages:")
5393            error_message_command_all = []
5394            error_message_command_warning = []
5395            error_message_command_err = []
5396            for err_file in err_files:
5397                with open(err_file, "r") as f:
5398                    for line in f:
5399                        message = line.strip()
5400                        error_message_command_all.append(message)
5401                        if line.startswith("[W::"):
5402                            error_message_command_warning.append(message)
5403                        if line.startswith("[E::"):
5404                            error_message_command_err.append(f"{err_file}: " + message)
5405            # log info
5406            for message in list(
5407                set(error_message_command_err + error_message_command_warning)
5408            ):
5409                log.info(f"   {message}")
5410            # debug info
5411            for message in list(set(error_message_command_all)):
5412                log.debug(f"   {message}")
5413            # failed
5414            if len(error_message_command_err):
5415                log.error("Annotation failed: Error in commands")
5416                raise ValueError("Annotation failed: Error in commands")
5417
5418            # Find annotation in header
5419            with open(tmp_annotate_vcf_name, "rt") as f:
5420                header_list = self.read_vcf_header(f)
5421            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5422
5423            for ann in annovar_vcf_header.infos:
5424                if ann not in self.get_header().infos:
5425                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5426
5427            # Update variants
5428            log.info(f"Annotation - Updating...")
5429            self.update_from_vcf(tmp_annotate_vcf_name)
5430
5431        else:
5432            if "ANN" in self.get_header().infos:
5433                log.debug(f"Existing snpEff annotations in VCF")
5434            if force_update_annotation:
5435                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotates with snpEff

Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_annovar(self, threads: int = None) -> None:
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar databases.

        Exports the variants to a temporary bgzipped VCF, runs Annovar
        (table_annovar.pl) once per configured database, pipes each result
        through bcftools/sed/awk to strip Annovar artifacts (ANNOVAR_DATE,
        escaped semicolons, empty "." fields), merges the per-database
        annotated VCFs with `bcftools merge`, adds any new INFO fields to the
        in-memory VCF header, and updates the variants table from the merged
        VCF. Temporary files are removed at the end.

        :param threads: number of threads to use (defaults to self.get_threads())
        :return: None; returns early if the variants table is empty
        :raises ValueError: if the annovar/bcftools binaries or the Annovar
            databases folder cannot be resolved, or if a per-database command
            wrote error lines ("[E::" / "ERROR") to its stderr capture file
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files (accumulated for final cleanup / error scanning)
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed here but the cleanup block at
        # the end is gated on `if True:` and never consults it — confirm
        # whether tmp files should really be kept in debug mode.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl wrapper around table_annovar.pl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (a list config selects its first entry;
        # the folder is created on demand)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl command-line options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (mapping: annovar database name -> fields config)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly sub-folder, created if missing
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header (mutated below when new INFO fields are found)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Always re-annotate, even when the field already exists in the header
        force_update_annotation = True

        if annotations:

            # NOTE(review): `commands` is never populated or used in this
            # method — appears to be dead code.
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO stripped to ".", no samples, tabix-indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database - download missing databases if needed
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run (and one cleaned output VCF) per database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar (err_files reset so each database's
                # stderr is scanned independently right after its command runs)
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" pair per line, appended via shell)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ("genebase" is consumed above, not forwarded)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                # ("^INFO/x" entries mean "keep x"; ANNOVAR_DATE/ALLELE_END are always dropped)
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages - scan the captured stderr files; "[E::"/"ERROR"
                # lines are fatal, "[W::"/"WARNING" lines are logged only
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                # NOTE(review): this stderr file is appended to err_files but the
                # error-scanning loop above only runs inside the per-database
                # loop — merge failures are never checked. Confirm whether the
                # merge should raise like the per-database commands do.
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge - merge original VCF with all per-database annotated VCFs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header - copy new INFO definitions from
                # the merged VCF header into the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)

It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations

Parameters
  • threads: number of threads to use
Returns

the value of the variable "return_value".

def annotation_parquet(self, threads: int = None) -> None:
5828    def annotation_parquet(self, threads: int = None) -> None:
5829        """
5830        It takes a VCF file, and annotates it with a parquet file
5831
5832        :param threads: number of threads to use for the annotation
5833        :return: the value of the variable "result".
5834        """
5835
5836        # DEBUG
5837        log.debug("Start annotation with parquet databases")
5838
5839        # Threads
5840        if not threads:
5841            threads = self.get_threads()
5842        log.debug("Threads: " + str(threads))
5843
5844        # DEBUG
5845        delete_tmp = True
5846        if self.get_config().get("verbosity", "warning") in ["debug"]:
5847            delete_tmp = False
5848            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5849
5850        # Config
5851        databases_folders = set(
5852            self.get_config()
5853            .get("folders", {})
5854            .get("databases", {})
5855            .get("annotations", ["."])
5856            + self.get_config()
5857            .get("folders", {})
5858            .get("databases", {})
5859            .get("parquet", ["."])
5860        )
5861        log.debug("Databases annotations: " + str(databases_folders))
5862
5863        # Param
5864        annotations = (
5865            self.get_param()
5866            .get("annotation", {})
5867            .get("parquet", {})
5868            .get("annotations", None)
5869        )
5870        log.debug("Annotations: " + str(annotations))
5871
5872        # Assembly
5873        assembly = self.get_param().get(
5874            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5875        )
5876
5877        # Force Update Annotation
5878        force_update_annotation = (
5879            self.get_param()
5880            .get("annotation", {})
5881            .get("options", {})
5882            .get("annotations_update", False)
5883        )
5884        log.debug(f"force_update_annotation={force_update_annotation}")
5885        force_append_annotation = (
5886            self.get_param()
5887            .get("annotation", {})
5888            .get("options", {})
5889            .get("annotations_append", False)
5890        )
5891        log.debug(f"force_append_annotation={force_append_annotation}")
5892
5893        # Data
5894        table_variants = self.get_table_variants()
5895
5896        # Check if not empty
5897        log.debug("Check if not empty")
5898        sql_query_chromosomes_df = self.get_query_to_df(
5899            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5900        )
5901        if not sql_query_chromosomes_df["count"][0]:
5902            log.info(f"VCF empty")
5903            return
5904
5905        # VCF header
5906        vcf_reader = self.get_header()
5907        log.debug("Initial header: " + str(vcf_reader.infos))
5908
5909        # Nb Variants POS
5910        log.debug("NB Variants Start")
5911        nb_variants = self.conn.execute(
5912            f"SELECT count(*) AS count FROM variants"
5913        ).fetchdf()["count"][0]
5914        log.debug("NB Variants Stop")
5915
5916        # Existing annotations
5917        for vcf_annotation in self.get_header().infos:
5918
5919            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5920            log.debug(
5921                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5922            )
5923
5924        # Added columns
5925        added_columns = []
5926
5927        # drop indexes
5928        log.debug(f"Drop indexes...")
5929        self.drop_indexes()
5930
5931        if annotations:
5932
5933            if "ALL" in annotations:
5934
5935                all_param = annotations.get("ALL", {})
5936                all_param_formats = all_param.get("formats", None)
5937                all_param_releases = all_param.get("releases", None)
5938
5939                databases_infos_dict = self.scan_databases(
5940                    database_formats=all_param_formats,
5941                    database_releases=all_param_releases,
5942                )
5943                for database_infos in databases_infos_dict.keys():
5944                    if database_infos not in annotations:
5945                        annotations[database_infos] = {"INFO": None}
5946
5947            for annotation in annotations:
5948
5949                if annotation in ["ALL"]:
5950                    continue
5951
5952                # Annotation Name
5953                annotation_name = os.path.basename(annotation)
5954
5955                # Annotation fields
5956                annotation_fields = annotations[annotation]
5957                if not annotation_fields:
5958                    annotation_fields = {"INFO": None}
5959
5960                log.debug(f"Annotation '{annotation_name}'")
5961                log.debug(
5962                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5963                )
5964
5965                # Create Database
5966                database = Database(
5967                    database=annotation,
5968                    databases_folders=databases_folders,
5969                    assembly=assembly,
5970                )
5971
5972                # Find files
5973                parquet_file = database.get_database()
5974                parquet_hdr_file = database.get_header_file()
5975                parquet_type = database.get_type()
5976
5977                # Check if files exists
5978                if not parquet_file or not parquet_hdr_file:
5979                    msg_err_list = []
5980                    if not parquet_file:
5981                        msg_err_list.append(
5982                            f"Annotation failed: Annotation file not found"
5983                        )
5984                    if parquet_file and not parquet_hdr_file:
5985                        msg_err_list.append(
5986                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
5987                        )
5988
5989                    log.error(". ".join(msg_err_list))
5990                    raise ValueError(". ".join(msg_err_list))
5991                else:
5992                    # Get parquet connexion
5993                    parquet_sql_attach = database.get_sql_database_attach(
5994                        output="query"
5995                    )
5996                    if parquet_sql_attach:
5997                        self.conn.execute(parquet_sql_attach)
5998                    parquet_file_link = database.get_sql_database_link()
5999                    # Log
6000                    log.debug(
6001                        f"Annotation '{annotation_name}' - file: "
6002                        + str(parquet_file)
6003                        + " and "
6004                        + str(parquet_hdr_file)
6005                    )
6006
6007                    # Database full header columns
6008                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
6009                        parquet_hdr_file
6010                    )
6011                    # Log
6012                    log.debug(
6013                        "Annotation database header columns : "
6014                        + str(parquet_hdr_vcf_header_columns)
6015                    )
6016
6017                    # Load header as VCF object
6018                    parquet_hdr_vcf_header_infos = database.get_header().infos
6019                    # Log
6020                    log.debug(
6021                        "Annotation database header: "
6022                        + str(parquet_hdr_vcf_header_infos)
6023                    )
6024
6025                    # Get extra infos
6026                    parquet_columns = database.get_extra_columns()
6027                    # Log
6028                    log.debug("Annotation database Columns: " + str(parquet_columns))
6029
6030                    # Add extra columns if "ALL" in annotation_fields
6031                    # if "ALL" in annotation_fields:
6032                    #     allow_add_extra_column = True
6033                    if "ALL" in annotation_fields and database.get_extra_columns():
6034                        for extra_column in database.get_extra_columns():
6035                            if (
6036                                extra_column not in annotation_fields
6037                                and extra_column.replace("INFO/", "")
6038                                not in parquet_hdr_vcf_header_infos
6039                            ):
6040                                parquet_hdr_vcf_header_infos[extra_column] = (
6041                                    vcf.parser._Info(
6042                                        extra_column,
6043                                        ".",
6044                                        "String",
6045                                        f"{extra_column} description",
6046                                        "unknown",
6047                                        "unknown",
6048                                        self.code_type_map["String"],
6049                                    )
6050                                )
6051
6052                    # For all fields in database
6053                    annotation_fields_all = False
6054                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
6055                        annotation_fields_all = True
6056                        annotation_fields = {
6057                            key: key for key in parquet_hdr_vcf_header_infos
6058                        }
6059
6060                        log.debug(
6061                            "Annotation database header - All annotations added: "
6062                            + str(annotation_fields)
6063                        )
6064
6065                    # Init
6066
6067                    # List of annotation fields to use
6068                    sql_query_annotation_update_info_sets = []
6069
6070                    # List of annotation to agregate
6071                    sql_query_annotation_to_agregate = []
6072
6073                    # Number of fields
6074                    nb_annotation_field = 0
6075
6076                    # Annotation fields processed
6077                    annotation_fields_processed = []
6078
6079                    # Columns mapping
6080                    map_columns = database.map_columns(
6081                        columns=annotation_fields, prefixes=["INFO/"]
6082                    )
6083
6084                    # Query dict for fields to remove (update option)
6085                    query_dict_remove = {}
6086
6087                    # Fetch Anotation fields
6088                    for annotation_field in annotation_fields:
6089
6090                        # annotation_field_column
6091                        annotation_field_column = map_columns.get(
6092                            annotation_field, "INFO"
6093                        )
6094
6095                        # field new name, if parametered
6096                        annotation_fields_new_name = annotation_fields.get(
6097                            annotation_field, annotation_field
6098                        )
6099                        if not annotation_fields_new_name:
6100                            annotation_fields_new_name = annotation_field
6101
6102                        # To annotate
6103                        # force_update_annotation = True
6104                        # force_append_annotation = True
6105                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
6106                        if annotation_field in parquet_hdr_vcf_header_infos and (
6107                            force_update_annotation
6108                            or force_append_annotation
6109                            or (
6110                                annotation_fields_new_name
6111                                not in self.get_header().infos
6112                            )
6113                        ):
6114
6115                            # Add field to annotation to process list
6116                            annotation_fields_processed.append(
6117                                annotation_fields_new_name
6118                            )
6119
6120                            # explode infos for the field
6121                            annotation_fields_new_name_info_msg = ""
6122                            if (
6123                                force_update_annotation
6124                                and annotation_fields_new_name
6125                                in self.get_header().infos
6126                            ):
6127                                # Remove field from INFO
6128                                query = f"""
6129                                    UPDATE {table_variants} as table_variants
6130                                    SET INFO = REGEXP_REPLACE(
6131                                                concat(table_variants.INFO,''),
6132                                                ';*{annotation_fields_new_name}=[^;]*',
6133                                                ''
6134                                                )
6135                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
6136                                """
6137                                annotation_fields_new_name_info_msg = " [update]"
6138                                query_dict_remove[
6139                                    f"remove 'INFO/{annotation_fields_new_name}'"
6140                                ] = query
6141
6142                            # Sep between fields in INFO
6143                            nb_annotation_field += 1
6144                            if nb_annotation_field > 1:
6145                                annotation_field_sep = ";"
6146                            else:
6147                                annotation_field_sep = ""
6148
6149                            log.info(
6150                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
6151                            )
6152
6153                            # Add INFO field to header
6154                            parquet_hdr_vcf_header_infos_number = (
6155                                parquet_hdr_vcf_header_infos[annotation_field].num
6156                                or "."
6157                            )
6158                            parquet_hdr_vcf_header_infos_type = (
6159                                parquet_hdr_vcf_header_infos[annotation_field].type
6160                                or "String"
6161                            )
6162                            parquet_hdr_vcf_header_infos_description = (
6163                                parquet_hdr_vcf_header_infos[annotation_field].desc
6164                                or f"{annotation_field} description"
6165                            )
6166                            parquet_hdr_vcf_header_infos_source = (
6167                                parquet_hdr_vcf_header_infos[annotation_field].source
6168                                or "unknown"
6169                            )
6170                            parquet_hdr_vcf_header_infos_version = (
6171                                parquet_hdr_vcf_header_infos[annotation_field].version
6172                                or "unknown"
6173                            )
6174
6175                            vcf_reader.infos[annotation_fields_new_name] = (
6176                                vcf.parser._Info(
6177                                    annotation_fields_new_name,
6178                                    parquet_hdr_vcf_header_infos_number,
6179                                    parquet_hdr_vcf_header_infos_type,
6180                                    parquet_hdr_vcf_header_infos_description,
6181                                    parquet_hdr_vcf_header_infos_source,
6182                                    parquet_hdr_vcf_header_infos_version,
6183                                    self.code_type_map[
6184                                        parquet_hdr_vcf_header_infos_type
6185                                    ],
6186                                )
6187                            )
6188
6189                            # Append
6190                            if force_append_annotation:
6191                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
6192                            else:
6193                                query_case_when_append = ""
6194
6195                            # Annotation/Update query fields
6196                            # Found in INFO column
6197                            if (
6198                                annotation_field_column == "INFO"
6199                                and "INFO" in parquet_hdr_vcf_header_columns
6200                            ):
6201                                sql_query_annotation_update_info_sets.append(
6202                                    f"""
6203                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
6204                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
6205                                        ELSE ''
6206                                    END
6207                                """
6208                                )
6209                            # Found in a specific column
6210                            else:
6211                                sql_query_annotation_update_info_sets.append(
6212                                    f"""
6213                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
6214                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
6215                                        ELSE ''
6216                                    END
6217                                """
6218                                )
6219                                sql_query_annotation_to_agregate.append(
6220                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
6221                                )
6222
6223                        # Not to annotate
6224                        else:
6225
6226                            if force_update_annotation:
6227                                annotation_message = "forced"
6228                            else:
6229                                annotation_message = "skipped"
6230
6231                            if annotation_field not in parquet_hdr_vcf_header_infos:
6232                                log.warning(
6233                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
6234                                )
6235                            if annotation_fields_new_name in self.get_header().infos:
6236                                log.warning(
6237                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
6238                                )
6239
6240                    # Check if ALL fields have to be annotated. Thus concat all INFO field
6241                    # allow_annotation_full_info = True
6242                    allow_annotation_full_info = not force_append_annotation
6243
6244                    if parquet_type in ["regions"]:
6245                        allow_annotation_full_info = False
6246
6247                    if (
6248                        allow_annotation_full_info
6249                        and nb_annotation_field == len(annotation_fields)
6250                        and annotation_fields_all
6251                        and (
6252                            "INFO" in parquet_hdr_vcf_header_columns
6253                            and "INFO" in database.get_extra_columns()
6254                        )
6255                    ):
6256                        log.debug("Column INFO annotation enabled")
6257                        sql_query_annotation_update_info_sets = []
6258                        sql_query_annotation_update_info_sets.append(
6259                            f" table_parquet.INFO "
6260                        )
6261
6262                    if sql_query_annotation_update_info_sets:
6263
6264                        # Annotate
6265                        log.info(f"Annotation '{annotation_name}' - Annotation...")
6266
6267                        # Join query annotation update info sets for SQL
6268                        sql_query_annotation_update_info_sets_sql = ",".join(
6269                            sql_query_annotation_update_info_sets
6270                        )
6271
6272                        # Check chromosomes list (and variants infos)
6273                        sql_query_chromosomes = f"""
6274                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
6275                            FROM {table_variants} as table_variants
6276                            GROUP BY table_variants."#CHROM"
6277                            ORDER BY table_variants."#CHROM"
6278                            """
6279                        sql_query_chromosomes_df = self.conn.execute(
6280                            sql_query_chromosomes
6281                        ).df()
6282                        sql_query_chromosomes_dict = {
6283                            entry["CHROM"]: {
6284                                "count": entry["count_variants"],
6285                                "min": entry["min_variants"],
6286                                "max": entry["max_variants"],
6287                            }
6288                            for index, entry in sql_query_chromosomes_df.iterrows()
6289                        }
6290
6291                        # Init
6292                        nb_of_query = 0
6293                        nb_of_variant_annotated = 0
6294                        query_dict = query_dict_remove
6295
6296                        # for chrom in sql_query_chromosomes_df["CHROM"]:
6297                        for chrom in sql_query_chromosomes_dict:
6298
6299                            # Number of variant by chromosome
6300                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
6301                                chrom, {}
6302                            ).get("count", 0)
6303
6304                            log.debug(
6305                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
6306                            )
6307
6308                            # Annotation with regions database
6309                            if parquet_type in ["regions"]:
6310                                sql_query_annotation_from_clause = f"""
6311                                    FROM (
6312                                        SELECT 
6313                                            '{chrom}' AS \"#CHROM\",
6314                                            table_variants_from.\"POS\" AS \"POS\",
6315                                            {",".join(sql_query_annotation_to_agregate)}
6316                                        FROM {table_variants} as table_variants_from
6317                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
6318                                            table_parquet_from."#CHROM" = '{chrom}'
6319                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
6320                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
6321                                        )
6322                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
6323                                        GROUP BY table_variants_from.\"POS\"
6324                                        )
6325                                        as table_parquet
6326                                """
6327
6328                                sql_query_annotation_where_clause = """
6329                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
6330                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
6331                                """
6332
6333                            # Annotation with variants database
6334                            else:
6335                                sql_query_annotation_from_clause = f"""
6336                                    FROM {parquet_file_link} as table_parquet
6337                                """
6338                                sql_query_annotation_where_clause = f"""
6339                                    table_variants."#CHROM" = '{chrom}'
6340                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
6341                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
6342                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
6343                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
6344                                """
6345
6346                            # Create update query
6347                            sql_query_annotation_chrom_interval_pos = f"""
6348                                UPDATE {table_variants} as table_variants
6349                                    SET INFO = 
6350                                        concat(
6351                                            CASE WHEN table_variants.INFO NOT IN ('','.')
6352                                                THEN table_variants.INFO
6353                                                ELSE ''
6354                                            END
6355                                            ,
6356                                            CASE WHEN table_variants.INFO NOT IN ('','.')
6357                                                        AND (
6358                                                        concat({sql_query_annotation_update_info_sets_sql})
6359                                                        )
6360                                                        NOT IN ('','.') 
6361                                                    THEN ';'
6362                                                    ELSE ''
6363                                            END
6364                                            ,
6365                                            {sql_query_annotation_update_info_sets_sql}
6366                                            )
6367                                    {sql_query_annotation_from_clause}
6368                                    WHERE {sql_query_annotation_where_clause}
6369                                    ;
6370                                """
6371
6372                            # Add update query to dict
6373                            query_dict[
6374                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
6375                            ] = sql_query_annotation_chrom_interval_pos
6376
6377                        nb_of_query = len(query_dict)
6378                        num_query = 0
6379
6380                        # SET max_expression_depth TO x
6381                        self.conn.execute("SET max_expression_depth TO 10000")
6382
6383                        for query_name in query_dict:
6384                            query = query_dict[query_name]
6385                            num_query += 1
6386                            log.info(
6387                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
6388                            )
6389                            result = self.conn.execute(query)
6390                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
6391                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
6392                            log.info(
6393                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
6394                            )
6395
6396                        log.info(
6397                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
6398                        )
6399
6400                    else:
6401
6402                        log.info(
6403                            f"Annotation '{annotation_name}' - No Annotations available"
6404                        )
6405
6406                    log.debug("Final header: " + str(vcf_reader.infos))
6407
6408        # Remove added columns
6409        for added_column in added_columns:
6410            self.drop_column(column=added_column)

It takes a VCF file, and annotates it with a parquet file

Parameters
  • threads: number of threads to use for the annotation
Returns

the value of the variable "result".

def annotation_splice(self, threads: int = None) -> None:
6412    def annotation_splice(self, threads: int = None) -> None:
6413        """
6414        This function annotate with snpEff
6415
6416        :param threads: The number of threads to use
6417        :return: the value of the variable "return_value".
6418        """
6419
6420        # DEBUG
6421        log.debug("Start annotation with splice tools")
6422
6423        # Threads
6424        if not threads:
6425            threads = self.get_threads()
6426        log.debug("Threads: " + str(threads))
6427
6428        # DEBUG
6429        delete_tmp = True
6430        if self.get_config().get("verbosity", "warning") in ["debug"]:
6431            delete_tmp = False
6432            log.debug("Delete tmp files/folders: " + str(delete_tmp))
6433
6434        # Config
6435        config = self.get_config()
6436        log.debug("Config: " + str(config))
6437        splice_config = config.get("tools", {}).get("splice", {})
6438        if not splice_config:
6439            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
6440            msg_err = "No Splice tool config"
6441            raise ValueError(msg_err)
6442        log.debug(f"splice_config: {splice_config}")
6443
6444        # Config - Folders - Databases
6445        databases_folders = (
6446            config.get("folders", {}).get("databases", {}).get("splice", ["."])
6447        )
6448        log.debug("Databases annotations: " + str(databases_folders))
6449
6450        # Splice docker image
6451        splice_docker_image = splice_config.get("docker").get("image")
6452
6453        # Pull splice image if it's not already there
6454        if not check_docker_image_exists(splice_docker_image):
6455            log.warning(
6456                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
6457            )
6458            try:
6459                command(f"docker pull {splice_config.get('docker').get('image')}")
6460            except subprocess.CalledProcessError:
6461                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
6462                log.error(msg_err)
6463                raise ValueError(msg_err)
6464
6465        # Config - splice databases
6466        splice_databases = (
6467            config.get("folders", {})
6468            .get("databases", {})
6469            .get("splice", DEFAULT_SPLICE_FOLDER)
6470        )
6471        splice_databases = full_path(splice_databases)
6472
6473        # Param
6474        param = self.get_param()
6475        log.debug("Param: " + str(param))
6476
6477        # Param
6478        options = param.get("annotation", {}).get("splice", {}).get("options", {})
6479        log.debug("Options: " + str(options))
6480
6481        # Data
6482        table_variants = self.get_table_variants()
6483
6484        # Check if not empty
6485        log.debug("Check if not empty")
6486        sql_query_chromosomes = (
6487            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
6488        )
6489        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
6490            log.info("VCF empty")
6491            return None
6492
6493        # Export in VCF
6494        log.debug("Create initial file to annotate")
6495
6496        # Create output folder / work folder
6497        if options.get("output_folder", ""):
6498            output_folder = options.get("output_folder", "")
6499            if not os.path.exists(output_folder):
6500                Path(output_folder).mkdir(parents=True, exist_ok=True)
6501        else:
6502            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
6503            if not os.path.exists(output_folder):
6504                Path(output_folder).mkdir(parents=True, exist_ok=True)
6505
6506        if options.get("workdir", ""):
6507            workdir = options.get("workdir", "")
6508        else:
6509            workdir = "/work"
6510
6511        # Create tmp VCF file
6512        tmp_vcf = NamedTemporaryFile(
6513            prefix=self.get_prefix(),
6514            dir=output_folder,
6515            suffix=".vcf",
6516            delete=False,
6517        )
6518        tmp_vcf_name = tmp_vcf.name
6519
6520        # VCF header
6521        header = self.get_header()
6522
6523        # Existing annotations
6524        for vcf_annotation in self.get_header().infos:
6525
6526            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6527            log.debug(
6528                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6529            )
6530
6531        # Memory limit
6532        if config.get("memory", None):
6533            memory_limit = config.get("memory", "8G").upper()
6534            # upper()
6535        else:
6536            memory_limit = "8G"
6537        log.debug(f"memory_limit: {memory_limit}")
6538
6539        # Check number of variants to annotate
6540        where_clause_regex_spliceai = r"SpliceAI_\w+"
6541        where_clause_regex_spip = r"SPiP_\w+"
6542        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6543        df_list_of_variants_to_annotate = self.get_query_to_df(
6544            query=f""" SELECT * FROM variants {where_clause} """
6545        )
6546        if len(df_list_of_variants_to_annotate) == 0:
6547            log.warning(
6548                f"No variants to annotate with splice. Variants probably already annotated with splice"
6549            )
6550            return None
6551        else:
6552            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6553
6554        # Export VCF file
6555        self.export_variant_vcf(
6556            vcf_file=tmp_vcf_name,
6557            remove_info=True,
6558            add_samples=True,
6559            index=False,
6560            where_clause=where_clause,
6561        )
6562        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
6563        if any(value for value in splice_config.values() if value is None):
6564            log.warning("At least one splice config parameter is empty")
6565            # exit annotation_splice
6566            return None
6567
6568        # Params in splice nf
6569        def check_values(dico: dict):
6570            """
6571            Ensure parameters for NF splice pipeline
6572            """
6573            for key, val in dico.items():
6574                if key == "genome":
6575                    if any(
6576                        assemb in options.get("genome", {})
6577                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6578                    ):
6579                        yield f"--{key} hg19"
6580                    elif any(
6581                        assemb in options.get("genome", {})
6582                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6583                    ):
6584                        yield f"--{key} hg38"
6585                elif (
6586                    (isinstance(val, str) and val)
6587                    or isinstance(val, int)
6588                    or isinstance(val, bool)
6589                ):
6590                    yield f"--{key} {val}"
6591
6592        # Genome
6593        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6594        options["genome"] = genome
6595        # NF params
6596        nf_params = []
6597        # Add options
6598        if options:
6599            log.debug(options)
6600            nf_params = list(check_values(options))
6601            log.debug(f"Splice NF params: {' '.join(nf_params)}")
6602        else:
6603            log.debug("No NF params provided")
6604        # Add threads
6605        if "threads" not in options.keys():
6606            nf_params.append(f"--threads {threads}")
6607        # Genome path
6608        genome_path = find_genome(
6609            config.get("folders", {})
6610            .get("databases", {})
6611            .get("genomes", DEFAULT_GENOME_FOLDER),
6612            file=f"{genome}.fa",
6613        )
6614        # Add genome path
6615        if not genome_path:
6616            raise ValueError(
6617                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6618            )
6619        else:
6620            log.debug(f"Genome: {genome_path}")
6621            nf_params.append(f"--genome_path {genome_path}")
6622
6623        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6624            """
6625            Setting up updated databases for SPiP and SpliceAI
6626            """
6627
6628            try:
6629
6630                # SpliceAI assembly transcriptome
6631                spliceai_assembly = os.path.join(
6632                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
6633                    options.get("genome"),
6634                    "transcriptome",
6635                )
6636                spip_assembly = options.get("genome")
6637
6638                spip = find(
6639                    f"transcriptome_{spip_assembly}.RData",
6640                    config.get("folders", {}).get("databases", {}).get("spip", {}),
6641                )
6642                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6643                log.debug(f"SPiP annotations: {spip}")
6644                log.debug(f"SpliceAI annotations: {spliceai}")
6645                if spip and spliceai:
6646                    return [
6647                        f"--spip_transcriptome {spip}",
6648                        f"--spliceai_transcriptome {spliceai}",
6649                    ]
6650                else:
6651                    log.warning(
6652                        "Can't find splice databases in configuration, use annotations file from image"
6653                    )
6654            except TypeError:
6655                log.warning(
6656                    "Can't find splice databases in configuration, use annotations file from image"
6657                )
6658                return []
6659
6660        # Add options, check if transcriptome option have already beend provided
6661        if (
6662            "spip_transcriptome" not in nf_params
6663            and "spliceai_transcriptome" not in nf_params
6664        ):
6665            splice_reference = splice_annotations(options, config)
6666            if splice_reference:
6667                nf_params.extend(splice_reference)
6668        # nf_params.append(f"--output_folder {output_folder}")
6669        random_uuid = f"HOWARD-SPLICE-{get_random()}"
6670        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6671        log.debug(cmd)
6672        splice_config["docker"]["command"] = cmd
6673
6674        # Ensure proxy is set
6675        proxy = [
6676            f"-e {var}={os.getenv(var)}"
6677            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
6678            if os.getenv(var) is not None
6679        ]
6680        docker_cmd = get_bin_command(
6681            tool="splice",
6682            bin_type="docker",
6683            config=config,
6684            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6685            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
6686        )
6687        # print(docker_cmd)
6688        # exit()
6689        # Docker debug
6690        # if splice_config.get("rm_container"):
6691        #     rm_container = "--rm"
6692        # else:
6693        #     rm_container = ""
6694        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6695        log.debug(docker_cmd)
6696        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6697        log.debug(res.stdout)
6698        if res.stderr:
6699            log.error(res.stderr)
6700        res.check_returncode()
6701        # Update variants
6702        log.info("Annotation - Updating...")
6703        # Test find output vcf
6704        log.debug(
6705            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6706        )
6707        output_vcf = []
6708        # Wrong folder to look in
6709        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6710            if (
6711                files
6712                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6713            ):
6714                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6715        # log.debug(os.listdir(options.get("output_folder")))
6716        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6717        if not output_vcf:
6718            log.debug(
6719                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6720            )
6721        else:
6722            # Get new header from annotated vcf
6723            log.debug(f"Initial header: {len(header.infos)} fields")
6724            # Create new header with splice infos
6725            new_vcf = Variants(input=output_vcf[0])
6726            new_vcf_header = new_vcf.get_header().infos
6727            for keys, infos in new_vcf_header.items():
6728                if keys not in header.infos.keys():
6729                    header.infos[keys] = infos
6730            log.debug(f"New header: {len(header.infos)} fields")
6731            log.debug(f"Splice tmp output: {output_vcf[0]}")
6732            self.update_from_vcf(output_vcf[0])
6733
6734        # Remove file
6735        remove_if_exists(output_vcf)

This function annotate with snpEff

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def get_config_default(self, name: str) -> dict:
6741    def get_config_default(self, name: str) -> dict:
6742        """
6743        The function `get_config_default` returns a dictionary containing default configurations for
6744        various calculations and prioritizations.
6745
6746        :param name: The `get_config_default` function returns a dictionary containing default
6747        configurations for different calculations and prioritizations. The `name` parameter is used to
6748        specify which specific configuration to retrieve from the dictionary
6749        :type name: str
6750        :return: The function `get_config_default` returns a dictionary containing default configuration
6751        settings for different calculations and prioritizations. The specific configuration settings are
6752        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6753        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6754        returned. If there is no match, an empty dictionary is returned.
6755        """
6756
6757        config_default = {
6758            "calculations": {
6759                "variant_chr_pos_alt_ref": {
6760                    "type": "sql",
6761                    "name": "variant_chr_pos_alt_ref",
6762                    "description": "Create a variant ID with chromosome, position, alt and ref",
6763                    "available": False,
6764                    "output_column_name": "variant_chr_pos_alt_ref",
6765                    "output_column_type": "String",
6766                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6767                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6768                    "operation_info": True,
6769                },
6770                "VARTYPE": {
6771                    "type": "sql",
6772                    "name": "VARTYPE",
6773                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6774                    "available": True,
6775                    "table": "variants",
6776                    "output_column_name": "VARTYPE",
6777                    "output_column_type": "String",
6778                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6779                    "operation_query": """
6780                            CASE
6781                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6782                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6783                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6784                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6785                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6786                                ELSE 'UNDEFINED'
6787                            END
6788                            """,
6789                    "info_fields": ["SVTYPE"],
6790                    "operation_info": True,
6791                },
6792                "snpeff_hgvs": {
6793                    "type": "python",
6794                    "name": "snpeff_hgvs",
6795                    "description": "HGVS nomenclatures from snpEff annotation",
6796                    "available": True,
6797                    "function_name": "calculation_extract_snpeff_hgvs",
6798                    "function_params": ["snpeff_hgvs", "ANN"],
6799                },
6800                "snpeff_ann_explode": {
6801                    "type": "python",
6802                    "name": "snpeff_ann_explode",
6803                    "description": "Explode snpEff annotations with uniquify values",
6804                    "available": True,
6805                    "function_name": "calculation_snpeff_ann_explode",
6806                    "function_params": [False, "fields", "snpeff_", "ANN"],
6807                },
6808                "snpeff_ann_explode_uniquify": {
6809                    "type": "python",
6810                    "name": "snpeff_ann_explode_uniquify",
6811                    "description": "Explode snpEff annotations",
6812                    "available": True,
6813                    "function_name": "calculation_snpeff_ann_explode",
6814                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6815                },
6816                "snpeff_ann_explode_json": {
6817                    "type": "python",
6818                    "name": "snpeff_ann_explode_json",
6819                    "description": "Explode snpEff annotations in JSON format",
6820                    "available": True,
6821                    "function_name": "calculation_snpeff_ann_explode",
6822                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6823                },
6824                "NOMEN": {
6825                    "type": "python",
6826                    "name": "NOMEN",
6827                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
6828                    "available": True,
6829                    "function_name": "calculation_extract_nomen",
6830                    "function_params": [],
6831                },
6832                "FINDBYPIPELINE": {
6833                    "type": "python",
6834                    "name": "FINDBYPIPELINE",
6835                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6836                    "available": True,
6837                    "function_name": "calculation_find_by_pipeline",
6838                    "function_params": ["findbypipeline"],
6839                },
6840                "FINDBYSAMPLE": {
6841                    "type": "python",
6842                    "name": "FINDBYSAMPLE",
6843                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6844                    "available": True,
6845                    "function_name": "calculation_find_by_pipeline",
6846                    "function_params": ["findbysample"],
6847                },
6848                "GENOTYPECONCORDANCE": {
6849                    "type": "python",
6850                    "name": "GENOTYPECONCORDANCE",
6851                    "description": "Concordance of genotype for multi caller VCF",
6852                    "available": True,
6853                    "function_name": "calculation_genotype_concordance",
6854                    "function_params": [],
6855                },
6856                "BARCODE": {
6857                    "type": "python",
6858                    "name": "BARCODE",
6859                    "description": "BARCODE as VaRank tool",
6860                    "available": True,
6861                    "function_name": "calculation_barcode",
6862                    "function_params": [],
6863                },
6864                "BARCODEFAMILY": {
6865                    "type": "python",
6866                    "name": "BARCODEFAMILY",
6867                    "description": "BARCODEFAMILY as VaRank tool",
6868                    "available": True,
6869                    "function_name": "calculation_barcode_family",
6870                    "function_params": ["BCF"],
6871                },
6872                "TRIO": {
6873                    "type": "python",
6874                    "name": "TRIO",
6875                    "description": "Inheritance for a trio family",
6876                    "available": True,
6877                    "function_name": "calculation_trio",
6878                    "function_params": [],
6879                },
6880                "VAF": {
6881                    "type": "python",
6882                    "name": "VAF",
6883                    "description": "Variant Allele Frequency (VAF) harmonization",
6884                    "available": True,
6885                    "function_name": "calculation_vaf_normalization",
6886                    "function_params": [],
6887                },
6888                "VAF_stats": {
6889                    "type": "python",
6890                    "name": "VAF_stats",
6891                    "description": "Variant Allele Frequency (VAF) statistics",
6892                    "available": True,
6893                    "function_name": "calculation_genotype_stats",
6894                    "function_params": ["VAF"],
6895                },
6896                "DP_stats": {
6897                    "type": "python",
6898                    "name": "DP_stats",
6899                    "description": "Depth (DP) statistics",
6900                    "available": True,
6901                    "function_name": "calculation_genotype_stats",
6902                    "function_params": ["DP"],
6903                },
6904                "variant_id": {
6905                    "type": "python",
6906                    "name": "variant_id",
6907                    "description": "Variant ID generated from variant position and type",
6908                    "available": True,
6909                    "function_name": "calculation_variant_id",
6910                    "function_params": [],
6911                },
6912                "transcripts_json": {
6913                    "type": "python",
6914                    "name": "transcripts_json",
6915                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
6916                    "available": True,
6917                    "function_name": "calculation_transcripts_annotation",
6918                    "function_params": ["transcripts_json", None],
6919                },
6920                "transcripts_ann": {
6921                    "type": "python",
6922                    "name": "transcripts_ann",
6923                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
6924                    "available": True,
6925                    "function_name": "calculation_transcripts_annotation",
6926                    "function_params": [None, "transcripts_ann"],
6927                },
6928                "transcripts_annotations": {
6929                    "type": "python",
6930                    "name": "transcripts_annotations",
6931                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
6932                    "available": True,
6933                    "function_name": "calculation_transcripts_annotation",
6934                    "function_params": [None, None],
6935                },
6936                "transcripts_prioritization": {
6937                    "type": "python",
6938                    "name": "transcripts_prioritization",
6939                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
6940                    "available": True,
6941                    "function_name": "calculation_transcripts_prioritization",
6942                    "function_params": [],
6943                },
6944                "transcripts_export": {
6945                    "type": "python",
6946                    "name": "transcripts_export",
6947                    "description": "Export transcripts table/view as a file (using param.json)",
6948                    "available": True,
6949                    "function_name": "calculation_transcripts_export",
6950                    "function_params": [],
6951                },
6952            },
6953            "prioritizations": {
6954                "default": {
6955                    "ANN2": [
6956                        {
6957                            "type": "contains",
6958                            "value": "HIGH",
6959                            "score": 5,
6960                            "flag": "PASS",
6961                            "comment": [
6962                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6963                            ],
6964                        },
6965                        {
6966                            "type": "contains",
6967                            "value": "MODERATE",
6968                            "score": 3,
6969                            "flag": "PASS",
6970                            "comment": [
6971                                "A non-disruptive variant that might change protein effectiveness"
6972                            ],
6973                        },
6974                        {
6975                            "type": "contains",
6976                            "value": "LOW",
6977                            "score": 0,
6978                            "flag": "FILTERED",
6979                            "comment": [
6980                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6981                            ],
6982                        },
6983                        {
6984                            "type": "contains",
6985                            "value": "MODIFIER",
6986                            "score": 0,
6987                            "flag": "FILTERED",
6988                            "comment": [
6989                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6990                            ],
6991                        },
6992                    ],
6993                }
6994            },
6995        }
6996
6997        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: The name of the configuration section to retrieve (e.g. "calculations" or "prioritizations") from the dictionary of default configurations.
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
6999    def get_config_json(
7000        self, name: str, config_dict: dict = {}, config_file: str = None
7001    ) -> dict:
7002        """
7003        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
7004        default values, a dictionary, and a file.
7005
7006        :param name: The `name` parameter in the `get_config_json` function is a string that represents
7007        the name of the configuration. It is used to identify and retrieve the configuration settings
7008        for a specific component or module
7009        :type name: str
7010        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
7011        dictionary that allows you to provide additional configuration settings or overrides. When you
7012        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
7013        the key is the configuration setting you want to override or
7014        :type config_dict: dict
7015        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
7016        specify the path to a configuration file that contains additional settings. If provided, the
7017        function will read the contents of this file and update the configuration dictionary with the
7018        values found in the file, overriding any existing values with the
7019        :type config_file: str
7020        :return: The function `get_config_json` returns a dictionary containing the configuration
7021        settings.
7022        """
7023
7024        # Create with default prioritizations
7025        config_default = self.get_config_default(name=name)
7026        configuration = config_default
7027        # log.debug(f"configuration={configuration}")
7028
7029        # Replace prioritizations from dict
7030        for config in config_dict:
7031            configuration[config] = config_dict[config]
7032
7033        # Replace prioritizations from file
7034        config_file = full_path(config_file)
7035        if config_file:
7036            if os.path.exists(config_file):
7037                with open(config_file) as config_file_content:
7038                    config_file_dict = json.load(config_file_content)
7039                for config in config_file_dict:
7040                    configuration[config] = config_file_dict[config]
7041            else:
7042                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
7043                log.error(msg_error)
7044                raise ValueError(msg_error)
7045
7046        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: A dictionary of additional configuration settings or overrides; each key-value pair replaces the corresponding entry of the default configuration.
  • config_file: The path to a configuration file containing additional settings. If provided, the function reads the contents of this file and updates the configuration dictionary, overriding any existing values with the values found in the file.
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization( self, table: str = None, pz_prefix: str = None, pz_param: dict = None) -> bool:
7048    def prioritization(
7049        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
7050    ) -> bool:
7051        """
7052        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
7053        prioritizes variants based on configured profiles and criteria.
7054
7055        :param table: The `table` parameter in the `prioritization` function is used to specify the name
7056        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
7057        a table name is provided, the method will prioritize the variants in that specific table
7058        :type table: str
7059        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
7060        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
7061        provided, the code will use a default prefix value of "PZ"
7062        :type pz_prefix: str
7063        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
7064        additional parameters specific to the prioritization process. These parameters can include
7065        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
7066        configurations needed for the prioritization of variants in a V
7067        :type pz_param: dict
7068        :return: A boolean value (True) is being returned from the `prioritization` function.
7069        """
7070
7071        # Config
7072        config = self.get_config()
7073
7074        # Param
7075        param = self.get_param()
7076
7077        # Prioritization param
7078        if pz_param is not None:
7079            prioritization_param = pz_param
7080        else:
7081            prioritization_param = param.get("prioritization", {})
7082
7083        # Configuration profiles
7084        prioritization_config_file = prioritization_param.get(
7085            "prioritization_config", None
7086        )
7087        prioritization_config_file = full_path(prioritization_config_file)
7088        prioritizations_config = self.get_config_json(
7089            name="prioritizations", config_file=prioritization_config_file
7090        )
7091
7092        # Prioritization prefix
7093        pz_prefix_default = "PZ"
7094        if pz_prefix is None:
7095            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
7096
7097        # Prioritization options
7098        profiles = prioritization_param.get("profiles", [])
7099        if isinstance(profiles, str):
7100            profiles = profiles.split(",")
7101        pzfields = prioritization_param.get(
7102            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
7103        )
7104        if isinstance(pzfields, str):
7105            pzfields = pzfields.split(",")
7106        default_profile = prioritization_param.get("default_profile", None)
7107        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
7108        prioritization_score_mode = prioritization_param.get(
7109            "prioritization_score_mode", "HOWARD"
7110        )
7111
7112        # Quick Prioritizations
7113        prioritizations = param.get("prioritizations", None)
7114        if prioritizations:
7115            log.info("Quick Prioritization:")
7116            for profile in prioritizations.split(","):
7117                if profile not in profiles:
7118                    profiles.append(profile)
7119                    log.info(f"   {profile}")
7120
7121        # If profile "ALL" provided, all profiles in the config profiles
7122        if "ALL" in profiles:
7123            profiles = list(prioritizations_config.keys())
7124
7125        for profile in profiles:
7126            if prioritizations_config.get(profile, None):
7127                log.debug(f"Profile '{profile}' configured")
7128            else:
7129                msg_error = f"Profile '{profile}' NOT configured"
7130                log.error(msg_error)
7131                raise ValueError(msg_error)
7132
7133        if profiles:
7134            log.info(f"Prioritization... ")
7135        else:
7136            log.debug(f"No profile defined")
7137            return False
7138
7139        if not default_profile and len(profiles):
7140            default_profile = profiles[0]
7141
7142        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
7143        log.debug("Profiles to check: " + str(list(profiles)))
7144
7145        # Variables
7146        if table is not None:
7147            table_variants = table
7148        else:
7149            table_variants = self.get_table_variants(clause="update")
7150        log.debug(f"Table to prioritize: {table_variants}")
7151
7152        # Added columns
7153        added_columns = []
7154
7155        # Create list of PZfields
7156        # List of PZFields
7157        list_of_pzfields_original = pzfields + [
7158            pzfield + pzfields_sep + profile
7159            for pzfield in pzfields
7160            for profile in profiles
7161        ]
7162        list_of_pzfields = []
7163        log.debug(f"{list_of_pzfields_original}")
7164
7165        # Remove existing PZfields to use if exists
7166        for pzfield in list_of_pzfields_original:
7167            if self.get_header().infos.get(pzfield, None) is None:
7168                list_of_pzfields.append(pzfield)
7169                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
7170            else:
7171                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
7172
7173        if list_of_pzfields:
7174
7175            # Explode Infos prefix
7176            explode_infos_prefix = self.get_explode_infos_prefix()
7177
7178            # PZfields tags description
7179            PZfields_INFOS = {
7180                f"{pz_prefix}Tags": {
7181                    "ID": f"{pz_prefix}Tags",
7182                    "Number": ".",
7183                    "Type": "String",
7184                    "Description": "Variant tags based on annotation criteria",
7185                },
7186                f"{pz_prefix}Score": {
7187                    "ID": f"{pz_prefix}Score",
7188                    "Number": 1,
7189                    "Type": "Integer",
7190                    "Description": "Variant score based on annotation criteria",
7191                },
7192                f"{pz_prefix}Flag": {
7193                    "ID": f"{pz_prefix}Flag",
7194                    "Number": 1,
7195                    "Type": "String",
7196                    "Description": "Variant flag based on annotation criteria",
7197                },
7198                f"{pz_prefix}Comment": {
7199                    "ID": f"{pz_prefix}Comment",
7200                    "Number": ".",
7201                    "Type": "String",
7202                    "Description": "Variant comment based on annotation criteria",
7203                },
7204                f"{pz_prefix}Infos": {
7205                    "ID": f"{pz_prefix}Infos",
7206                    "Number": ".",
7207                    "Type": "String",
7208                    "Description": "Variant infos based on annotation criteria",
7209                },
7210                f"{pz_prefix}Class": {
7211                    "ID": f"{pz_prefix}Class",
7212                    "Number": ".",
7213                    "Type": "String",
7214                    "Description": "Variant class based on annotation criteria",
7215                },
7216            }
7217
7218            # Create INFO fields if not exist
7219            for field in PZfields_INFOS:
7220                field_ID = PZfields_INFOS[field]["ID"]
7221                field_description = PZfields_INFOS[field]["Description"]
7222                if field_ID not in self.get_header().infos and field_ID in pzfields:
7223                    field_description = (
7224                        PZfields_INFOS[field]["Description"]
7225                        + f", profile {default_profile}"
7226                    )
7227                    self.get_header().infos[field_ID] = vcf.parser._Info(
7228                        field_ID,
7229                        PZfields_INFOS[field]["Number"],
7230                        PZfields_INFOS[field]["Type"],
7231                        field_description,
7232                        "unknown",
7233                        "unknown",
7234                        code_type_map[PZfields_INFOS[field]["Type"]],
7235                    )
7236
7237            # Create INFO fields if not exist for each profile
7238            for profile in prioritizations_config:
7239                if profile in profiles or profiles == []:
7240                    for field in PZfields_INFOS:
7241                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
7242                        field_description = (
7243                            PZfields_INFOS[field]["Description"]
7244                            + f", profile {profile}"
7245                        )
7246                        if (
7247                            field_ID not in self.get_header().infos
7248                            and field in pzfields
7249                        ):
7250                            self.get_header().infos[field_ID] = vcf.parser._Info(
7251                                field_ID,
7252                                PZfields_INFOS[field]["Number"],
7253                                PZfields_INFOS[field]["Type"],
7254                                field_description,
7255                                "unknown",
7256                                "unknown",
7257                                code_type_map[PZfields_INFOS[field]["Type"]],
7258                            )
7259
7260            # Header
7261            for pzfield in list_of_pzfields:
7262                if re.match(f"{pz_prefix}Score.*", pzfield):
7263                    added_column = self.add_column(
7264                        table_name=table_variants,
7265                        column_name=pzfield,
7266                        column_type="INTEGER",
7267                        default_value="0",
7268                    )
7269                elif re.match(f"{pz_prefix}Flag.*", pzfield):
7270                    added_column = self.add_column(
7271                        table_name=table_variants,
7272                        column_name=pzfield,
7273                        column_type="BOOLEAN",
7274                        default_value="1",
7275                    )
7276                elif re.match(f"{pz_prefix}Class.*", pzfield):
7277                    added_column = self.add_column(
7278                        table_name=table_variants,
7279                        column_name=pzfield,
7280                        column_type="VARCHAR[]",
7281                        default_value="null",
7282                    )
7283                else:
7284                    added_column = self.add_column(
7285                        table_name=table_variants,
7286                        column_name=pzfield,
7287                        column_type="STRING",
7288                        default_value="''",
7289                    )
7290                added_columns.append(added_column)
7291
7292            # Profiles
7293            if profiles:
7294
7295                # foreach profile in configuration file
7296                for profile in prioritizations_config:
7297
7298                    # If profile is asked in param, or ALL are asked (empty profile [])
7299                    if profile in profiles or profiles == []:
7300                        log.info(f"Profile '{profile}'")
7301
7302                        sql_set_info_option = ""
7303
7304                        sql_set_info = []
7305
7306                        # PZ fields set
7307
7308                        # PZScore
7309                        if (
7310                            f"{pz_prefix}Score{pzfields_sep}{profile}"
7311                            in list_of_pzfields
7312                        ):
7313                            sql_set_info.append(
7314                                f"""
7315                                    concat(
7316                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
7317                                        {pz_prefix}Score{pzfields_sep}{profile}
7318                                    ) 
7319                                """
7320                            )
7321                            if (
7322                                profile == default_profile
7323                                and f"{pz_prefix}Score" in list_of_pzfields
7324                            ):
7325                                sql_set_info.append(
7326                                    f"""
7327                                        concat(
7328                                            '{pz_prefix}Score=',
7329                                            {pz_prefix}Score{pzfields_sep}{profile}
7330                                        )
7331                                    """
7332                                )
7333
7334                        # PZFlag
7335                        if (
7336                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
7337                            in list_of_pzfields
7338                        ):
7339                            sql_set_info.append(
7340                                f"""
7341                                    concat(
7342                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
7343                                        CASE 
7344                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7345                                            THEN 'PASS'
7346                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7347                                            THEN 'FILTERED'
7348                                        END
7349                                    ) 
7350                                """
7351                            )
7352                            if (
7353                                profile == default_profile
7354                                and f"{pz_prefix}Flag" in list_of_pzfields
7355                            ):
7356                                sql_set_info.append(
7357                                    f"""
7358                                        concat(
7359                                            '{pz_prefix}Flag=',
7360                                            CASE 
7361                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7362                                                THEN 'PASS'
7363                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7364                                                THEN 'FILTERED'
7365                                            END
7366                                        )
7367                                    """
7368                                )
7369
7370                        # PZClass
7371                        if (
7372                            f"{pz_prefix}Class{pzfields_sep}{profile}"
7373                            in list_of_pzfields
7374                        ):
7375                            sql_set_info.append(
7376                                f"""
7377                                    concat(
7378                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
7379                                        CASE
7380                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7381                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7382                                            ELSE '.'
7383                                        END 
7384                                    )
7385                                    
7386                                """
7387                            )
7388                            if (
7389                                profile == default_profile
7390                                and f"{pz_prefix}Class" in list_of_pzfields
7391                            ):
7392                                sql_set_info.append(
7393                                    f"""
7394                                        concat(
7395                                            '{pz_prefix}Class=',
7396                                            CASE
7397                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7398                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7399                                                ELSE '.'
7400                                            END 
7401                                        )
7402                                    """
7403                                )
7404
7405                        # PZComment
7406                        if (
7407                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
7408                            in list_of_pzfields
7409                        ):
7410                            sql_set_info.append(
7411                                f"""
7412                                    CASE
7413                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7414                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
7415                                        ELSE ''
7416                                    END
7417                                """
7418                            )
7419                            if (
7420                                profile == default_profile
7421                                and f"{pz_prefix}Comment" in list_of_pzfields
7422                            ):
7423                                sql_set_info.append(
7424                                    f"""
7425                                        CASE
7426                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7427                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
7428                                            ELSE ''
7429                                        END
7430                                    """
7431                                )
7432
7433                        # PZInfos
7434                        if (
7435                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
7436                            in list_of_pzfields
7437                        ):
7438                            sql_set_info.append(
7439                                f"""
7440                                    CASE
7441                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7442                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
7443                                        ELSE ''
7444                                    END
7445                                """
7446                            )
7447                            if (
7448                                profile == default_profile
7449                                and f"{pz_prefix}Infos" in list_of_pzfields
7450                            ):
7451                                sql_set_info.append(
7452                                    f"""
7453                                        CASE
7454                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7455                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
7456                                            ELSE ''
7457                                        END
7458                                    """
7459                                )
7460
7461                        # Merge PZfields
7462                        sql_set_info_option = ""
7463                        sql_set_sep = ""
7464                        for sql_set in sql_set_info:
7465                            if sql_set_sep:
7466                                sql_set_info_option += f"""
7467                                    , concat('{sql_set_sep}', {sql_set})
7468                                """
7469                            else:
7470                                sql_set_info_option += f"""
7471                                    , {sql_set}
7472                                """
7473                            sql_set_sep = ";"
7474
7475                        sql_queries = []
7476                        for annotation in prioritizations_config[profile]:
7477
7478                            # skip special sections
7479                            if annotation.startswith("_"):
7480                                continue
7481
7482                            # For each criterions
7483                            for criterion in prioritizations_config[profile][
7484                                annotation
7485                            ]:
7486
7487                                # Criterion mode
7488                                criterion_mode = None
7489                                if np.any(
7490                                    np.isin(list(criterion.keys()), ["type", "value"])
7491                                ):
7492                                    criterion_mode = "operation"
7493                                elif np.any(
7494                                    np.isin(list(criterion.keys()), ["sql", "fields"])
7495                                ):
7496                                    criterion_mode = "sql"
7497                                log.debug(f"Criterion Mode: {criterion_mode}")
7498
7499                                # Criterion parameters
7500                                criterion_type = criterion.get("type", None)
7501                                criterion_value = criterion.get("value", None)
7502                                criterion_sql = criterion.get("sql", None)
7503                                criterion_fields = criterion.get("fields", None)
7504                                criterion_score = criterion.get("score", 0)
7505                                criterion_flag = criterion.get("flag", "PASS")
7506                                criterion_class = criterion.get("class", None)
7507                                criterion_flag_bool = criterion_flag == "PASS"
7508                                criterion_comment = (
7509                                    ", ".join(criterion.get("comment", []))
7510                                    .replace("'", "''")
7511                                    .replace(";", ",")
7512                                    .replace("\t", " ")
7513                                )
7514                                criterion_infos = (
7515                                    str(criterion)
7516                                    .replace("'", "''")
7517                                    .replace(";", ",")
7518                                    .replace("\t", " ")
7519                                )
7520
7521                                # SQL
7522                                if criterion_sql is not None and isinstance(
7523                                    criterion_sql, list
7524                                ):
7525                                    criterion_sql = " ".join(criterion_sql)
7526
7527                                # Fields and explode
7528                                if criterion_fields is None:
7529                                    criterion_fields = [annotation]
7530                                if not isinstance(criterion_fields, list):
7531                                    criterion_fields = str(criterion_fields).split(",")
7532
7533                                # Class
7534                                if criterion_class is not None and not isinstance(
7535                                    criterion_class, list
7536                                ):
7537                                    criterion_class = str(criterion_class).split(",")
7538
7539                                for annotation_field in criterion_fields:
7540
7541                                    # Explode specific annotation
7542                                    log.debug(
7543                                        f"Explode annotation '{annotation_field}'"
7544                                    )
7545                                    added_columns += self.explode_infos(
7546                                        prefix=explode_infos_prefix,
7547                                        fields=[annotation_field],
7548                                        table=table_variants,
7549                                    )
7550                                    extra_infos = self.get_extra_infos(
7551                                        table=table_variants
7552                                    )
7553
7554                                    # Check if annotation field is present
7555                                    if (
7556                                        f"{explode_infos_prefix}{annotation_field}"
7557                                        not in extra_infos
7558                                    ):
7559                                        msq_err = f"Annotation '{annotation_field}' not in data"
7560                                        log.error(msq_err)
7561                                        raise ValueError(msq_err)
7562                                    else:
7563                                        log.debug(
7564                                            f"Annotation '{annotation_field}' in data"
7565                                        )
7566
7567                                sql_set = []
7568                                sql_set_info = []
7569
7570                                # PZ fields set
7571
7572                                # PZScore
7573                                if (
7574                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
7575                                    in list_of_pzfields
7576                                ):
7577                                    # if prioritization_score_mode == "HOWARD":
7578                                    #     sql_set.append(
7579                                    #         f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7580                                    #     )
7581                                    # VaRank prioritization score mode
7582                                    if prioritization_score_mode == "VaRank":
7583                                        sql_set.append(
7584                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
7585                                        )
7586                                    # default HOWARD prioritization score mode
7587                                    else:
7588                                        sql_set.append(
7589                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7590                                        )
7591
7592                                # PZFlag
7593                                if (
7594                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
7595                                    in list_of_pzfields
7596                                ):
7597                                    sql_set.append(
7598                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
7599                                    )
7600
7601                                # PZClass
7602                                if (
7603                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
7604                                    in list_of_pzfields
7605                                    and criterion_class is not None
7606                                ):
7607                                    sql_set.append(
7608                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
7609                                    )
7610
7611                                # PZComment
7612                                if (
7613                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
7614                                    in list_of_pzfields
7615                                ):
7616                                    sql_set.append(
7617                                        f"""
7618                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
7619                                                concat(
7620                                                    {pz_prefix}Comment{pzfields_sep}{profile},
7621                                                    CASE 
7622                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
7623                                                        THEN ', '
7624                                                        ELSE ''
7625                                                    END,
7626                                                    '{criterion_comment}'
7627                                                )
7628                                        """
7629                                    )
7630
7631                                # PZInfos
7632                                if (
7633                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
7634                                    in list_of_pzfields
7635                                ):
7636                                    sql_set.append(
7637                                        f"""
7638                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
7639                                                concat(
7640                                                    {pz_prefix}Infos{pzfields_sep}{profile},
7641                                                    '{criterion_infos}'
7642                                                )
7643                                        """
7644                                    )
7645                                sql_set_option = ",".join(sql_set)
7646
7647                                # Criterion and comparison
7648                                if sql_set_option:
7649
7650                                    if criterion_mode in ["operation"]:
7651
7652                                        try:
7653                                            float(criterion_value)
7654                                            sql_update = f"""
7655                                                UPDATE {table_variants}
7656                                                SET {sql_set_option}
7657                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7658                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
7659                                            """
7660                                        except:
7661                                            contains_option = ""
7662                                            if criterion_type == "contains":
7663                                                contains_option = ".*"
7664                                            sql_update = f"""
7665                                                UPDATE {table_variants}
7666                                                SET {sql_set_option}
7667                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7668                                            """
7669                                        sql_queries.append(sql_update)
7670
7671                                    elif criterion_mode in ["sql"]:
7672
7673                                        sql_update = f"""
7674                                            UPDATE {table_variants}
7675                                            SET {sql_set_option}
7676                                            WHERE {criterion_sql}
7677                                        """
7678                                        sql_queries.append(sql_update)
7679
7680                                    else:
7681                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
7682                                        log.error(msg_err)
7683                                        raise ValueError(msg_err)
7684
7685                                else:
7686                                    log.warning(
7687                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
7688                                    )
7689
7690                        # PZTags
7691                        if (
7692                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
7693                            in list_of_pzfields
7694                        ):
7695
7696                            # Create PZFalgs value
7697                            pztags_value = ""
7698                            pztags_sep_default = ","
7699                            pztags_sep = ""
7700                            for pzfield in pzfields:
7701                                if pzfield not in [f"{pz_prefix}Tags"]:
7702                                    if (
7703                                        f"{pzfield}{pzfields_sep}{profile}"
7704                                        in list_of_pzfields
7705                                    ):
7706                                        if pzfield in [f"{pz_prefix}Flag"]:
7707                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7708                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
7709                                                    THEN 'PASS'
7710                                                    ELSE 'FILTERED'
7711                                                END, '"""
7712                                        elif pzfield in [f"{pz_prefix}Class"]:
7713                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7714                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7715                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7716                                                    ELSE '.'
7717                                                END, '"""
7718                                        else:
7719                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7720                                        pztags_sep = pztags_sep_default
7721
7722                            # Add Query update for PZFlags
7723                            sql_update_pztags = f"""
7724                                UPDATE {table_variants}
7725                                SET INFO = concat(
7726                                        INFO,
7727                                        CASE WHEN INFO NOT in ('','.')
7728                                                THEN ';'
7729                                                ELSE ''
7730                                        END,
7731                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
7732                                    )
7733                                """
7734                            sql_queries.append(sql_update_pztags)
7735
7736                            # Add Query update for PZFlags for default
7737                            if profile == default_profile:
7738                                sql_update_pztags_default = f"""
7739                                UPDATE {table_variants}
7740                                SET INFO = concat(
7741                                        INFO,
7742                                        ';',
7743                                        '{pz_prefix}Tags={pztags_value}'
7744                                    )
7745                                """
7746                                sql_queries.append(sql_update_pztags_default)
7747
7748                        log.info(f"""Profile '{profile}' - Prioritization... """)
7749
7750                        if sql_queries:
7751
7752                            for sql_query in sql_queries:
7753                                log.debug(
7754                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7755                                )
7756                                self.conn.execute(sql_query)
7757
7758                        log.info(f"""Profile '{profile}' - Update... """)
7759                        sql_query_update = f"""
7760                            UPDATE {table_variants}
7761                            SET INFO =  
7762                                concat(
7763                                    CASE
7764                                        WHEN INFO NOT IN ('','.')
7765                                        THEN concat(INFO, ';')
7766                                        ELSE ''
7767                                    END
7768                                    {sql_set_info_option}
7769                                )
7770                        """
7771                        self.conn.execute(sql_query_update)
7772
7773        else:
7774
7775            log.warning(f"No profiles in parameters")
7776
7777        # Remove added columns
7778        for added_column in added_columns:
7779            self.drop_column(column=added_column)
7780
7781        # Explode INFOS fields into table fields
7782        if self.get_explode_infos():
7783            self.explode_infos(
7784                prefix=self.get_explode_infos_prefix(),
7785                fields=self.get_explode_infos_fields(),
7786                force=True,
7787            )
7788
7789        return True

The prioritization function in Python processes VCF files, adds new INFO fields, and prioritizes variants based on configured profiles and criteria.

Parameters
  • table: The table parameter specifies the name of the variants table (typically loaded from a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method prioritizes the variants in that specific table
  • pz_prefix: The pz_prefix parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ"
  • pz_param: The pz_param parameter in the prioritization method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file
Returns

The prioritization function returns a boolean value (True) on successful completion.

def annotation_hgvs(self, threads: int = None) -> None:
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Only variants whose REF and ALT are made of letters (SNV and InDel) are
        annotated. The computed HGVS names are appended to the INFO column as an
        'hgvs=' field, and the 'hgvs' INFO field is declared in the VCF header.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        # NOTE: both nested functions close over names bound later in this method
        # (polars_conn, transcripts, genome, use_* flags...). This is safe because
        # Dask only invokes them during ddf.compute(), after those names exist.
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            :param row: A dictionary-like object with "CHROM", "POS", "REF" and "ALT" keys
            :return: a comma-separated string of the HGVS names for the given row.
            """

            # NOTE: `chr` shadows the `chr()` builtin (kept unchanged here)
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # (queries the polars frame registered globally as 'refseq_df';
            # values are interpolated, not parameterized — data comes from the
            # variants table itself)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): 'refseqlink_df' is only defined below when a
                # refSeqLink file was found — presumably a protein option
                # without refSeqLink would fail here; confirm with callers.
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level HGVS name per transcript
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations (comma-separated)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse comma-separated "opt" or "opt=val" pairs into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                # "true"/"false" (any case) become real booleans
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; return early otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq refSeqLink (param values take precedence over config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT made of letters only)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # (random suffix avoids clashing with an existing column name)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # (only transcripts overlapping a variant position)
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe (queried by the partition worker as 'refseqlink_df')
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # NOTE(review): duplicates the SQLContext created above — presumably
        # re-created so the context sees refseq_df/refseqlink_df; confirm.
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>' with ';' separator when needed
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        # NOTE(review): typo "annotatation" kept as-is — changing it would
        # alter the emitted VCF header text.
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
8182    def get_operations_help(
8183        self, operations_config_dict: dict = {}, operations_config_file: str = None
8184    ) -> list:
8185
8186        # Init
8187        operations_help = []
8188
8189        # operations
8190        operations = self.get_config_json(
8191            name="calculations",
8192            config_dict=operations_config_dict,
8193            config_file=operations_config_file,
8194        )
8195        for op in operations:
8196            op_name = operations[op].get("name", op).upper()
8197            op_description = operations[op].get("description", op_name)
8198            op_available = operations[op].get("available", False)
8199            if op_available:
8200                operations_help.append(f"   {op_name}: {op_description}")
8201
8202        # Sort operations
8203        operations_help.sort()
8204
8205        # insert header
8206        operations_help.insert(0, "Available calculation operations:")
8207
8208        # Return
8209        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        :param operations: dictionary of operations to perform (keys are operation
        names); overridden by the "calculation.calculations" section of the parameters
        :type operations: dict (optional)
        :param operations_config_dict: optional operations configuration dictionary
        :type operations_config_dict: dict (optional)
        :param operations_config_file: optional operations configuration file path
        :type operations_config_file: str (optional)
        :raises ValueError: if an operation name or operation type is not available
        in the operations configuration

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    }
                },
                "middle": null
            }
        """

        # Param
        param = self.get_param()

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys (operation lookup is case-insensitive)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (param takes precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add
        # ("calculations" is a comma-separated string of operation names)
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Create tmp operations (to keep operation order:
            # quick calculations first, then operations already in param)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation
        # NOTE(review): redundant with the earlier assignment from param —
        # operations is already this value when empty; kept as a safety net.
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # For each operations: dispatch on the configured type ("python" or "sql")
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" } }, "middle": null }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
8333    def calculation_process_sql(
8334        self, operation: dict, operation_name: str = "unknown"
8335    ) -> None:
8336        """
8337        The `calculation_process_sql` function takes in a mathematical operation as a string and
8338        performs the operation, updating the specified table with the result.
8339
8340        :param operation: The `operation` parameter is a dictionary that contains information about the
8341        mathematical operation to be performed. It includes the following keys:
8342        :type operation: dict
8343        :param operation_name: The `operation_name` parameter is a string that represents the name of
8344        the mathematical operation being performed. It is used for logging and error handling purposes,
8345        defaults to unknown
8346        :type operation_name: str (optional)
8347        """
8348
8349        # Operation infos
8350        operation_name = operation.get("name", "unknown")
8351        log.debug(f"process sql {operation_name}")
8352        output_column_name = operation.get("output_column_name", operation_name)
8353        output_column_type = operation.get("output_column_type", "String")
8354        prefix = operation.get("explode_infos_prefix", "")
8355        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
8356        output_column_description = operation.get(
8357            "output_column_description", f"{operation_name} operation"
8358        )
8359        operation_query = operation.get("operation_query", None)
8360        if isinstance(operation_query, list):
8361            operation_query = " ".join(operation_query)
8362        operation_info_fields = operation.get("info_fields", [])
8363        operation_info_fields_check = operation.get("info_fields_check", False)
8364        operation_info = operation.get("operation_info", True)
8365        operation_table = operation.get(
8366            "table", self.get_table_variants(clause="alter")
8367        )
8368
8369        # table variants
8370        if operation_table:
8371            table_variants = operation_table
8372        else:
8373            table_variants = self.get_table_variants(clause="alter")
8374
8375        if operation_query:
8376
8377            # Info fields check
8378            operation_info_fields_check_result = True
8379            if operation_info_fields_check:
8380                header_infos = self.get_header().infos
8381                for info_field in operation_info_fields:
8382                    operation_info_fields_check_result = (
8383                        operation_info_fields_check_result
8384                        and info_field in header_infos
8385                    )
8386
8387            # If info fields available
8388            if operation_info_fields_check_result:
8389
8390                # Added_columns
8391                added_columns = []
8392
8393                # Create VCF header field
8394                vcf_reader = self.get_header()
8395                vcf_reader.infos[output_column_name] = vcf.parser._Info(
8396                    output_column_name,
8397                    ".",
8398                    output_column_type,
8399                    output_column_description,
8400                    "howard calculation",
8401                    "0",
8402                    self.code_type_map.get(output_column_type),
8403                )
8404
8405                # Explode infos if needed
8406                log.debug(f"calculation_process_sql prefix {prefix}")
8407                added_columns += self.explode_infos(
8408                    prefix=prefix,
8409                    fields=[output_column_name] + operation_info_fields,
8410                    force=False,
8411                    table=table_variants,
8412                )
8413
8414                # Create column
8415                added_column = self.add_column(
8416                    table_name=table_variants,
8417                    column_name=prefix + output_column_name,
8418                    column_type=output_column_type_sql,
8419                    default_value="null",
8420                )
8421                added_columns.append(added_column)
8422
8423                # Operation calculation
8424                try:
8425
8426                    # Query to update calculation column
8427                    sql_update = f"""
8428                        UPDATE {table_variants}
8429                        SET "{prefix}{output_column_name}" = ({operation_query})
8430                    """
8431                    self.conn.execute(sql_update)
8432
8433                    # Add to INFO
8434                    if operation_info:
8435                        sql_update_info = f"""
8436                            UPDATE {table_variants}
8437                            SET "INFO" =
8438                                concat(
8439                                    CASE
8440                                        WHEN "INFO" IS NOT NULL
8441                                        THEN concat("INFO", ';')
8442                                        ELSE ''
8443                                    END,
8444                                    '{output_column_name}=',
8445                                    "{prefix}{output_column_name}"
8446                                )
8447                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
8448                        """
8449                        self.conn.execute(sql_update_info)
8450
8451                except:
8452                    log.error(
8453                        f"Operations config: Calculation '{operation_name}' query failed"
8454                    )
8455                    raise ValueError(
8456                        f"Operations config: Calculation '{operation_name}' query failed"
8457                    )
8458
8459                # Remove added columns
8460                for added_column in added_columns:
8461                    log.debug(f"added_column: {added_column}")
8462                    self.drop_column(column=added_column)
8463
8464            else:
8465                log.error(
8466                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8467                )
8468                raise ValueError(
8469                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8470                )
8471
8472        else:
8473            log.error(
8474                f"Operations config: Calculation '{operation_name}' query NOT defined"
8475            )
8476            raise ValueError(
8477                f"Operations config: Calculation '{operation_name}' query NOT defined"
8478            )

The calculation_process_sql function takes in a mathematical operation as a string and performs the operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
8480    def calculation_process_function(
8481        self, operation: dict, operation_name: str = "unknown"
8482    ) -> None:
8483        """
8484        The `calculation_process_function` takes in an operation dictionary and performs the specified
8485        function with the given parameters.
8486
8487        :param operation: The `operation` parameter is a dictionary that contains information about the
8488        operation to be performed. It has the following keys:
8489        :type operation: dict
8490        :param operation_name: The `operation_name` parameter is a string that represents the name of
8491        the operation being performed. It is used for logging purposes, defaults to unknown
8492        :type operation_name: str (optional)
8493        """
8494
8495        operation_name = operation["name"]
8496        log.debug(f"process sql {operation_name}")
8497        function_name = operation["function_name"]
8498        function_params = operation["function_params"]
8499        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
8501    def calculation_variant_id(self) -> None:
8502        """
8503        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
8504        updates the INFO field of a variants table with the variant ID.
8505        """
8506
8507        # variant_id annotation field
8508        variant_id_tag = self.get_variant_id_column()
8509        added_columns = [variant_id_tag]
8510
8511        # variant_id hgvs tags"
8512        vcf_infos_tags = {
8513            variant_id_tag: "howard variant ID annotation",
8514        }
8515
8516        # Variants table
8517        table_variants = self.get_table_variants()
8518
8519        # Header
8520        vcf_reader = self.get_header()
8521
8522        # Add variant_id to header
8523        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
8524            variant_id_tag,
8525            ".",
8526            "String",
8527            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
8528            "howard calculation",
8529            "0",
8530            self.code_type_map.get("String"),
8531        )
8532
8533        # Update
8534        sql_update = f"""
8535            UPDATE {table_variants}
8536            SET "INFO" = 
8537                concat(
8538                    CASE
8539                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8540                        THEN ''
8541                        ELSE concat("INFO", ';')
8542                    END,
8543                    '{variant_id_tag}=',
8544                    "{variant_id_tag}"
8545                )
8546        """
8547        self.conn.execute(sql_update)
8548
8549        # Remove added columns
8550        for added_column in added_columns:
8551            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
8553    def calculation_extract_snpeff_hgvs(
8554        self,
8555        snpeff_hgvs: str = "snpeff_hgvs",
8556        snpeff_field: str = "ANN",
8557    ) -> None:
8558        """
8559        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
8560        annotation field in a VCF file and adds them as a new column in the variants table.
8561
8562        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
8563        function is used to specify the name of the column that will store the HGVS nomenclatures
8564        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
8565        snpeff_hgvs
8566        :type snpeff_hgvs: str (optional)
8567        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
8568        function represents the field in the VCF file that contains SnpEff annotations. This field is
8569        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
8570        to ANN
8571        :type snpeff_field: str (optional)
8572        """
8573
8574        # Snpeff hgvs tags
8575        vcf_infos_tags = {
8576            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
8577        }
8578
8579        # Prefix
8580        prefix = self.get_explode_infos_prefix()
8581        if prefix:
8582            prefix = "INFO/"
8583
8584        # snpEff fields
8585        speff_ann_infos = prefix + snpeff_field
8586        speff_hgvs_infos = prefix + snpeff_hgvs
8587
8588        # Variants table
8589        table_variants = self.get_table_variants()
8590
8591        # Header
8592        vcf_reader = self.get_header()
8593
8594        # Add columns
8595        added_columns = []
8596
8597        # Explode HGVS field in column
8598        added_columns += self.explode_infos(fields=[snpeff_field])
8599
8600        if snpeff_field in vcf_reader.infos:
8601
8602            log.debug(vcf_reader.infos[snpeff_field])
8603
8604            # Extract ANN header
8605            ann_description = vcf_reader.infos[snpeff_field].desc
8606            pattern = r"'(.+?)'"
8607            match = re.search(pattern, ann_description)
8608            if match:
8609                ann_header_match = match.group(1).split(" | ")
8610                ann_header_desc = {}
8611                for i in range(len(ann_header_match)):
8612                    ann_header_info = "".join(
8613                        char for char in ann_header_match[i] if char.isalnum()
8614                    )
8615                    ann_header_desc[ann_header_info] = ann_header_match[i]
8616                if not ann_header_desc:
8617                    raise ValueError("Invalid header description format")
8618            else:
8619                raise ValueError("Invalid header description format")
8620
8621            # Create variant id
8622            variant_id_column = self.get_variant_id_column()
8623            added_columns += [variant_id_column]
8624
8625            # Create dataframe
8626            dataframe_snpeff_hgvs = self.get_query_to_df(
8627                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8628            )
8629
8630            # Create main NOMEN column
8631            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8632                speff_ann_infos
8633            ].apply(
8634                lambda x: extract_snpeff_hgvs(
8635                    str(x), header=list(ann_header_desc.values())
8636                )
8637            )
8638
8639            # Add snpeff_hgvs to header
8640            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
8641                snpeff_hgvs,
8642                ".",
8643                "String",
8644                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
8645                "howard calculation",
8646                "0",
8647                self.code_type_map.get("String"),
8648            )
8649
8650            # Update
8651            sql_update = f"""
8652                UPDATE variants
8653                SET "INFO" = 
8654                    concat(
8655                        CASE
8656                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8657                            THEN ''
8658                            ELSE concat("INFO", ';')
8659                        END,
8660                        CASE 
8661                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8662                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8663                            THEN concat(
8664                                    '{snpeff_hgvs}=',
8665                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8666                                )
8667                            ELSE ''
8668                        END
8669                    )
8670                FROM dataframe_snpeff_hgvs
8671                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8672
8673            """
8674            self.conn.execute(sql_update)
8675
8676            # Delete dataframe
8677            del dataframe_snpeff_hgvs
8678            gc.collect()
8679
8680        else:
8681
8682            log.warning(
8683                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8684            )
8685
8686        # Remove added columns
8687        for added_column in added_columns:
8688            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: The snpeff_hgvs parameter in the calculation_extract_snpeff_hgvs function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file, defaults to snpeff_hgvs
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults to ANN
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
8690    def calculation_snpeff_ann_explode(
8691        self,
8692        uniquify: bool = True,
8693        output_format: str = "fields",
8694        output_prefix: str = "snpeff_",
8695        snpeff_field: str = "ANN",
8696    ) -> None:
8697        """
8698        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
8699        exploding the HGVS field and updating variant information accordingly.
8700
8701        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
8702        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
8703        it indicates that the output should be unique, meaning that duplicate entries should be removed,
8704        defaults to True
8705        :type uniquify: bool (optional)
8706        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
8707        function specifies the format in which the output annotations will be generated. It has a
8708        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
8709        format, defaults to fields
8710        :type output_format: str (optional)
8711        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
8712        method is used to specify the prefix that will be added to the output annotations generated
8713        during the calculation process. This prefix helps to differentiate the newly added annotations
8714        from existing ones in the output data. By default, the, defaults to ANN_
8715        :type output_prefix: str (optional)
8716        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
8717        function is used to specify the field in the VCF file that contains SnpEff annotations. This
8718        field will be processed to explode the HGVS annotations and update the variant information
8719        accordingly, defaults to ANN
8720        :type snpeff_field: str (optional)
8721        """
8722
8723        # SnpEff annotation field
8724        snpeff_hgvs = "snpeff_ann_explode"
8725
8726        # Snpeff hgvs tags
8727        vcf_infos_tags = {
8728            snpeff_hgvs: "Explode snpEff annotations",
8729        }
8730
8731        # Prefix
8732        prefix = self.get_explode_infos_prefix()
8733        if prefix:
8734            prefix = "INFO/"
8735
8736        # snpEff fields
8737        speff_ann_infos = prefix + snpeff_field
8738        speff_hgvs_infos = prefix + snpeff_hgvs
8739
8740        # Variants table
8741        table_variants = self.get_table_variants()
8742
8743        # Header
8744        vcf_reader = self.get_header()
8745
8746        # Add columns
8747        added_columns = []
8748
8749        # Explode HGVS field in column
8750        added_columns += self.explode_infos(fields=[snpeff_field])
8751        log.debug(f"snpeff_field={snpeff_field}")
8752        log.debug(f"added_columns={added_columns}")
8753
8754        if snpeff_field in vcf_reader.infos:
8755
8756            # Extract ANN header
8757            ann_description = vcf_reader.infos[snpeff_field].desc
8758            pattern = r"'(.+?)'"
8759            match = re.search(pattern, ann_description)
8760            if match:
8761                ann_header_match = match.group(1).split(" | ")
8762                ann_header = []
8763                ann_header_desc = {}
8764                for i in range(len(ann_header_match)):
8765                    ann_header_info = "".join(
8766                        char for char in ann_header_match[i] if char.isalnum()
8767                    )
8768                    ann_header.append(ann_header_info)
8769                    ann_header_desc[ann_header_info] = ann_header_match[i]
8770                if not ann_header_desc:
8771                    raise ValueError("Invalid header description format")
8772            else:
8773                raise ValueError("Invalid header description format")
8774
8775            # Create variant id
8776            variant_id_column = self.get_variant_id_column()
8777            added_columns += [variant_id_column]
8778
8779            # Create dataframe
8780            dataframe_snpeff_hgvs = self.get_query_to_df(
8781                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8782            )
8783
8784            # Create snpEff columns
8785            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8786                speff_ann_infos
8787            ].apply(
8788                lambda x: explode_snpeff_ann(
8789                    str(x),
8790                    uniquify=uniquify,
8791                    output_format=output_format,
8792                    prefix=output_prefix,
8793                    header=list(ann_header_desc.values()),
8794                )
8795            )
8796
8797            # Header
8798            ann_annotations_prefix = ""
8799            if output_format.upper() in ["JSON"]:
8800                ann_annotations_prefix = f"{output_prefix}="
8801                vcf_reader.infos[output_prefix] = vcf.parser._Info(
8802                    output_prefix,
8803                    ".",
8804                    "String",
8805                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8806                    + " - JSON format",
8807                    "howard calculation",
8808                    "0",
8809                    self.code_type_map.get("String"),
8810                )
8811            else:
8812                for ann_annotation in ann_header:
8813                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
8814                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
8815                        ann_annotation_id,
8816                        ".",
8817                        "String",
8818                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8819                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
8820                        "howard calculation",
8821                        "0",
8822                        self.code_type_map.get("String"),
8823                    )
8824
8825            # Update
8826            sql_update = f"""
8827                UPDATE variants
8828                SET "INFO" = 
8829                    concat(
8830                        CASE
8831                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8832                            THEN ''
8833                            ELSE concat("INFO", ';')
8834                        END,
8835                        CASE 
8836                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8837                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8838                            THEN concat(
8839                                '{ann_annotations_prefix}',
8840                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8841                                )
8842                            ELSE ''
8843                        END
8844                    )
8845                FROM dataframe_snpeff_hgvs
8846                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8847
8848            """
8849            self.conn.execute(sql_update)
8850
8851            # Delete dataframe
8852            del dataframe_snpeff_hgvs
8853            gc.collect()
8854
8855        else:
8856
8857            log.warning(
8858                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8859            )
8860
8861        # Remove added columns
8862        for added_column in added_columns:
8863            self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method is used to specify the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones in the output data, defaults to snpeff_
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
def calculation_extract_nomen(self) -> None:
8865    def calculation_extract_nomen(self) -> None:
8866        """
8867        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
8868        """
8869
8870        # NOMEN field
8871        field_nomen_dict = "NOMEN_DICT"
8872
8873        # NOMEN structure
8874        nomen_dict = {
8875            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
8876            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
8877            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
8878            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
8879            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
8880            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
8881            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
8882            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
8883            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
8884            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
8885        }
8886
8887        # Param
8888        param = self.get_param()
8889
8890        # Prefix
8891        prefix = self.get_explode_infos_prefix()
8892
8893        # Header
8894        vcf_reader = self.get_header()
8895
8896        # Added columns
8897        added_columns = []
8898
8899        # Get HGVS field
8900        hgvs_field = (
8901            param.get("calculation", {})
8902            .get("calculations", {})
8903            .get("NOMEN", {})
8904            .get("options", {})
8905            .get("hgvs_field", "hgvs")
8906        )
8907
8908        # Get NOMEN pattern
8909        nomen_pattern = (
8910            param.get("calculation", {})
8911            .get("calculations", {})
8912            .get("NOMEN", {})
8913            .get("options", {})
8914            .get("pattern", None)
8915        )
8916
8917        # transcripts list of preference sources
8918        transcripts_sources = {}
8919
8920        # Get transcripts
8921        transcripts_file = (
8922            param.get("calculation", {})
8923            .get("calculations", {})
8924            .get("NOMEN", {})
8925            .get("options", {})
8926            .get("transcripts", None)
8927        )
8928        transcripts_file = full_path(transcripts_file)
8929        if transcripts_file:
8930            if os.path.exists(transcripts_file):
8931                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
8932                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
8933                transcripts_sources["file"] = transcripts_from_file
8934            else:
8935                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
8936                log.error(msg_err)
8937                raise ValueError(msg_err)
8938
8939        # Get transcripts table
8940        transcripts_table = (
8941            param.get("calculation", {})
8942            .get("calculations", {})
8943            .get("NOMEN", {})
8944            .get("options", {})
8945            .get("transcripts_table", self.get_table_variants())
8946        )
8947        # Get transcripts column
8948        transcripts_column = (
8949            param.get("calculation", {})
8950            .get("calculations", {})
8951            .get("NOMEN", {})
8952            .get("options", {})
8953            .get("transcripts_column", None)
8954        )
8955
8956        if transcripts_table and transcripts_column:
8957            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
8958            # Explode if not exists
8959            self.explode_infos(fields=[transcripts_column], table=transcripts_table)
8960        else:
8961            extra_field_transcript = f"NULL"
8962
8963        # Transcripts of preference source order
8964        transcripts_order = (
8965            param.get("calculation", {})
8966            .get("calculations", {})
8967            .get("NOMEN", {})
8968            .get("options", {})
8969            .get("transcripts_order", ["column", "file"])
8970        )
8971
8972        # Transcripts from file
8973        transcripts = transcripts_sources.get("file", [])
8974
8975        # Explode HGVS field in column
8976        added_columns += self.explode_infos(fields=[hgvs_field])
8977
8978        # extra infos
8979        extra_infos = self.get_extra_infos()
8980        extra_field = prefix + hgvs_field
8981
8982        if extra_field in extra_infos:
8983
8984            # Create dataframe
8985            dataframe_hgvs = self.get_query_to_df(
8986                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
8987            )
8988
8989            # Create main NOMEN column
8990            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
8991                lambda x: find_nomen(
8992                    hgvs=x.hgvs,
8993                    transcript=x.transcript,
8994                    transcripts=transcripts,
8995                    pattern=nomen_pattern,
8996                    transcripts_source_order=transcripts_order,
8997                ),
8998                axis=1,
8999            )
9000
9001            # Explode NOMEN Structure and create SQL set for update
9002            sql_nomen_fields = []
9003            for nomen_field in nomen_dict:
9004
9005                # Explode each field into a column
9006                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
9007                    lambda x: dict(x).get(nomen_field, "")
9008                )
9009
9010                # Create VCF header field
9011                vcf_reader.infos[nomen_field] = vcf.parser._Info(
9012                    nomen_field,
9013                    ".",
9014                    "String",
9015                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
9016                    "howard calculation",
9017                    "0",
9018                    self.code_type_map.get("String"),
9019                )
9020                sql_nomen_fields.append(
9021                    f"""
9022                        CASE 
9023                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
9024                            THEN concat(
9025                                    ';{nomen_field}=',
9026                                    dataframe_hgvs."{nomen_field}"
9027                                )
9028                            ELSE ''
9029                        END
9030                    """
9031                )
9032
9033            # SQL set for update
9034            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
9035
9036            # Update
9037            sql_update = f"""
9038                UPDATE variants
9039                SET "INFO" = 
9040                    concat(
9041                        CASE
9042                            WHEN "INFO" IS NULL
9043                            THEN ''
9044                            ELSE "INFO"
9045                        END,
9046                        {sql_nomen_fields_set}
9047                    )
9048                FROM dataframe_hgvs
9049                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
9050                    AND variants."POS" = dataframe_hgvs."POS" 
9051                    AND variants."REF" = dataframe_hgvs."REF"
9052                    AND variants."ALT" = dataframe_hgvs."ALT"
9053            """
9054            self.conn.execute(sql_update)
9055
9056            # Delete dataframe
9057            del dataframe_hgvs
9058            gc.collect()
9059
9060        # Remove added columns
9061        for added_column in added_columns:
9062            self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
9064    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
9065        """
9066        The function `calculation_find_by_pipeline` performs a calculation to find the number of
9067        pipeline/sample for a variant and updates the variant information in a VCF file.
9068
9069        :param tag: The `tag` parameter is a string that represents the annotation field for the
9070        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
9071        VCF header and to update the corresponding field in the variants table, defaults to
9072        findbypipeline
9073        :type tag: str (optional)
9074        """
9075
9076        # if FORMAT and samples
9077        if (
9078            "FORMAT" in self.get_header_columns_as_list()
9079            and self.get_header_sample_list()
9080        ):
9081
9082            # findbypipeline annotation field
9083            findbypipeline_tag = tag
9084
9085            # VCF infos tags
9086            vcf_infos_tags = {
9087                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
9088            }
9089
9090            # Prefix
9091            prefix = self.get_explode_infos_prefix()
9092
9093            # Field
9094            findbypipeline_infos = prefix + findbypipeline_tag
9095
9096            # Variants table
9097            table_variants = self.get_table_variants()
9098
9099            # Header
9100            vcf_reader = self.get_header()
9101
9102            # Create variant id
9103            variant_id_column = self.get_variant_id_column()
9104            added_columns = [variant_id_column]
9105
9106            # variant_id, FORMAT and samples
9107            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9108                self.get_header_sample_list()
9109            )
9110
9111            # Create dataframe
9112            dataframe_findbypipeline = self.get_query_to_df(
9113                f""" SELECT {samples_fields} FROM {table_variants} """
9114            )
9115
9116            # Create findbypipeline column
9117            dataframe_findbypipeline[findbypipeline_infos] = (
9118                dataframe_findbypipeline.apply(
9119                    lambda row: findbypipeline(
9120                        row, samples=self.get_header_sample_list()
9121                    ),
9122                    axis=1,
9123                )
9124            )
9125
9126            # Add snpeff_hgvs to header
9127            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
9128                findbypipeline_tag,
9129                ".",
9130                "String",
9131                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
9132                "howard calculation",
9133                "0",
9134                self.code_type_map.get("String"),
9135            )
9136
9137            # Update
9138            sql_update = f"""
9139                UPDATE variants
9140                SET "INFO" = 
9141                    concat(
9142                        CASE
9143                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9144                            THEN ''
9145                            ELSE concat("INFO", ';')
9146                        END,
9147                        CASE 
9148                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
9149                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
9150                            THEN concat(
9151                                    '{findbypipeline_tag}=',
9152                                    dataframe_findbypipeline."{findbypipeline_infos}"
9153                                )
9154                            ELSE ''
9155                        END
9156                    )
9157                FROM dataframe_findbypipeline
9158                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
9159            """
9160            self.conn.execute(sql_update)
9161
9162            # Remove added columns
9163            for added_column in added_columns:
9164                self.drop_column(column=added_column)
9165
9166            # Delete dataframe
9167            del dataframe_findbypipeline
9168            gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
9170    def calculation_genotype_concordance(self) -> None:
9171        """
9172        The function `calculation_genotype_concordance` calculates the genotype concordance for
9173        multi-caller VCF files and updates the variant information in the database.
9174        """
9175
9176        # if FORMAT and samples
9177        if (
9178            "FORMAT" in self.get_header_columns_as_list()
9179            and self.get_header_sample_list()
9180        ):
9181
9182            # genotypeconcordance annotation field
9183            genotypeconcordance_tag = "genotypeconcordance"
9184
9185            # VCF infos tags
9186            vcf_infos_tags = {
9187                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
9188            }
9189
9190            # Prefix
9191            prefix = self.get_explode_infos_prefix()
9192
9193            # Field
9194            genotypeconcordance_infos = prefix + genotypeconcordance_tag
9195
9196            # Variants table
9197            table_variants = self.get_table_variants()
9198
9199            # Header
9200            vcf_reader = self.get_header()
9201
9202            # Create variant id
9203            variant_id_column = self.get_variant_id_column()
9204            added_columns = [variant_id_column]
9205
9206            # variant_id, FORMAT and samples
9207            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9208                self.get_header_sample_list()
9209            )
9210
9211            # Create dataframe
9212            dataframe_genotypeconcordance = self.get_query_to_df(
9213                f""" SELECT {samples_fields} FROM {table_variants} """
9214            )
9215
9216            # Create genotypeconcordance column
9217            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
9218                dataframe_genotypeconcordance.apply(
9219                    lambda row: genotypeconcordance(
9220                        row, samples=self.get_header_sample_list()
9221                    ),
9222                    axis=1,
9223                )
9224            )
9225
9226            # Add genotypeconcordance to header
9227            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
9228                genotypeconcordance_tag,
9229                ".",
9230                "String",
9231                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
9232                "howard calculation",
9233                "0",
9234                self.code_type_map.get("String"),
9235            )
9236
9237            # Update
9238            sql_update = f"""
9239                UPDATE variants
9240                SET "INFO" = 
9241                    concat(
9242                        CASE
9243                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9244                            THEN ''
9245                            ELSE concat("INFO", ';')
9246                        END,
9247                        CASE
9248                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
9249                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
9250                            THEN concat(
9251                                    '{genotypeconcordance_tag}=',
9252                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
9253                                )
9254                            ELSE ''
9255                        END
9256                    )
9257                FROM dataframe_genotypeconcordance
9258                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
9259            """
9260            self.conn.execute(sql_update)
9261
9262            # Remove added columns
9263            for added_column in added_columns:
9264                self.drop_column(column=added_column)
9265
9266            # Delete dataframe
9267            del dataframe_genotypeconcordance
9268            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
9270    def calculation_barcode(self, tag: str = "barcode") -> None:
9271        """
9272        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
9273        updates the INFO field in the file with the calculated barcode values.
9274
9275        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
9276        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
9277        the default tag name is set to "barcode", defaults to barcode
9278        :type tag: str (optional)
9279        """
9280
9281        # if FORMAT and samples
9282        if (
9283            "FORMAT" in self.get_header_columns_as_list()
9284            and self.get_header_sample_list()
9285        ):
9286
9287            # barcode annotation field
9288            if not tag:
9289                tag = "barcode"
9290
9291            # VCF infos tags
9292            vcf_infos_tags = {
9293                tag: "barcode calculation (VaRank)",
9294            }
9295
9296            # Prefix
9297            prefix = self.get_explode_infos_prefix()
9298
9299            # Field
9300            barcode_infos = prefix + tag
9301
9302            # Variants table
9303            table_variants = self.get_table_variants()
9304
9305            # Header
9306            vcf_reader = self.get_header()
9307
9308            # Create variant id
9309            variant_id_column = self.get_variant_id_column()
9310            added_columns = [variant_id_column]
9311
9312            # variant_id, FORMAT and samples
9313            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9314                self.get_header_sample_list()
9315            )
9316
9317            # Create dataframe
9318            dataframe_barcode = self.get_query_to_df(
9319                f""" SELECT {samples_fields} FROM {table_variants} """
9320            )
9321
9322            # Create barcode column
9323            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
9324                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
9325            )
9326
9327            # Add barcode to header
9328            vcf_reader.infos[tag] = vcf.parser._Info(
9329                tag,
9330                ".",
9331                "String",
9332                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
9333                "howard calculation",
9334                "0",
9335                self.code_type_map.get("String"),
9336            )
9337
9338            # Update
9339            sql_update = f"""
9340                UPDATE {table_variants}
9341                SET "INFO" = 
9342                    concat(
9343                        CASE
9344                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9345                            THEN ''
9346                            ELSE concat("INFO", ';')
9347                        END,
9348                        CASE
9349                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
9350                            AND dataframe_barcode."{barcode_infos}" NOT NULL
9351                            THEN concat(
9352                                    '{tag}=',
9353                                    dataframe_barcode."{barcode_infos}"
9354                                )
9355                            ELSE ''
9356                        END
9357                    )
9358                FROM dataframe_barcode
9359                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
9360            """
9361            self.conn.execute(sql_update)
9362
9363            # Remove added columns
9364            for added_column in added_columns:
9365                self.drop_column(column=added_column)
9366
9367            # Delete dataframe
9368            del dataframe_barcode
9369            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
def calculation_barcode_family(self, tag: str = 'BCF') -> None:
9371    def calculation_barcode_family(self, tag: str = "BCF") -> None:
9372        """
9373        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
9374        and updates the INFO field in the file with the calculated barcode values.
9375
9376        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
9377        the barcode tag that will be added to the VCF file during the calculation process. If no value
9378        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
9379        :type tag: str (optional)
9380        """
9381
9382        # if FORMAT and samples
9383        if (
9384            "FORMAT" in self.get_header_columns_as_list()
9385            and self.get_header_sample_list()
9386        ):
9387
9388            # barcode annotation field
9389            if not tag:
9390                tag = "BCF"
9391
9392            # VCF infos tags
9393            vcf_infos_tags = {
9394                tag: "barcode family calculation",
9395                f"{tag}S": "barcode family samples",
9396            }
9397
9398            # Param
9399            param = self.get_param()
9400            log.debug(f"param={param}")
9401
9402            # Prefix
9403            prefix = self.get_explode_infos_prefix()
9404
9405            # PED param
9406            ped = (
9407                param.get("calculation", {})
9408                .get("calculations", {})
9409                .get("BARCODEFAMILY", {})
9410                .get("family_pedigree", None)
9411            )
9412            log.debug(f"ped={ped}")
9413
9414            # Load PED
9415            if ped:
9416
9417                # Pedigree is a file
9418                if isinstance(ped, str) and os.path.exists(full_path(ped)):
9419                    log.debug("Pedigree is file")
9420                    with open(full_path(ped)) as ped:
9421                        ped = json.load(ped)
9422
9423                # Pedigree is a string
9424                elif isinstance(ped, str):
9425                    log.debug("Pedigree is str")
9426                    try:
9427                        ped = json.loads(ped)
9428                        log.debug("Pedigree is json str")
9429                    except ValueError as e:
9430                        ped_samples = ped.split(",")
9431                        ped = {}
9432                        for ped_sample in ped_samples:
9433                            ped[ped_sample] = ped_sample
9434
9435                # Pedigree is a dict
9436                elif isinstance(ped, dict):
9437                    log.debug("Pedigree is dict")
9438
9439                # Pedigree is not well formatted
9440                else:
9441                    msg_error = "Pedigree not well formatted"
9442                    log.error(msg_error)
9443                    raise ValueError(msg_error)
9444
9445                # Construct list
9446                ped_samples = list(ped.values())
9447
9448            else:
9449                log.debug("Pedigree not defined. Take all samples")
9450                ped_samples = self.get_header_sample_list()
9451                ped = {}
9452                for ped_sample in ped_samples:
9453                    ped[ped_sample] = ped_sample
9454
9455            # Check pedigree
9456            if not ped or len(ped) == 0:
9457                msg_error = f"Error in pedigree: samples {ped_samples}"
9458                log.error(msg_error)
9459                raise ValueError(msg_error)
9460
9461            # Log
9462            log.info(
9463                "Calculation 'BARCODEFAMILY' - Samples: "
9464                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
9465            )
9466            log.debug(f"ped_samples={ped_samples}")
9467
9468            # Field
9469            barcode_infos = prefix + tag
9470
9471            # Variants table
9472            table_variants = self.get_table_variants()
9473
9474            # Header
9475            vcf_reader = self.get_header()
9476
9477            # Create variant id
9478            variant_id_column = self.get_variant_id_column()
9479            added_columns = [variant_id_column]
9480
9481            # variant_id, FORMAT and samples
9482            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9483                ped_samples
9484            )
9485
9486            # Create dataframe
9487            dataframe_barcode = self.get_query_to_df(
9488                f""" SELECT {samples_fields} FROM {table_variants} """
9489            )
9490
9491            # Create barcode column
9492            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
9493                lambda row: barcode(row, samples=ped_samples), axis=1
9494            )
9495
9496            # Add barcode family to header
9497            # Add vaf_normalization to header
9498            vcf_reader.formats[tag] = vcf.parser._Format(
9499                id=tag,
9500                num=".",
9501                type="String",
9502                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
9503                type_code=self.code_type_map.get("String"),
9504            )
9505            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
9506                id=f"{tag}S",
9507                num=".",
9508                type="String",
9509                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
9510                type_code=self.code_type_map.get("String"),
9511            )
9512
9513            # Update
9514            # for sample in ped_samples:
9515            sql_update_set = []
9516            for sample in self.get_header_sample_list() + ["FORMAT"]:
9517                if sample in ped_samples:
9518                    value = f'dataframe_barcode."{barcode_infos}"'
9519                    value_samples = "'" + ",".join(ped_samples) + "'"
9520                elif sample == "FORMAT":
9521                    value = f"'{tag}'"
9522                    value_samples = f"'{tag}S'"
9523                else:
9524                    value = "'.'"
9525                    value_samples = "'.'"
9526                format_regex = r"[a-zA-Z0-9\s]"
9527                sql_update_set.append(
9528                    f"""
9529                        "{sample}" = 
9530                        concat(
9531                            CASE
9532                                WHEN {table_variants}."{sample}" = './.'
9533                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
9534                                ELSE {table_variants}."{sample}"
9535                            END,
9536                            ':',
9537                            {value},
9538                            ':',
9539                            {value_samples}
9540                        )
9541                    """
9542                )
9543
9544            sql_update_set_join = ", ".join(sql_update_set)
9545            sql_update = f"""
9546                UPDATE {table_variants}
9547                SET {sql_update_set_join}
9548                FROM dataframe_barcode
9549                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
9550            """
9551            self.conn.execute(sql_update)
9552
9553            # Remove added columns
9554            for added_column in added_columns:
9555                self.drop_column(column=added_column)
9556
9557            # Delete dataframe
9558            del dataframe_barcode
9559            gc.collect()

The calculation_barcode_family function calculates a family barcode value for each variant in a VCF file and appends it to the FORMAT column and to every sample genotype column (pedigree samples receive the barcode value and the list of pedigree samples; other samples receive '.').

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
def calculation_trio(self) -> None:
9561    def calculation_trio(self) -> None:
9562        """
9563        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
9564        information to the INFO field of each variant.
9565        """
9566
9567        # if FORMAT and samples
9568        if (
9569            "FORMAT" in self.get_header_columns_as_list()
9570            and self.get_header_sample_list()
9571        ):
9572
9573            # trio annotation field
9574            trio_tag = "trio"
9575
9576            # VCF infos tags
9577            vcf_infos_tags = {
9578                "trio": "trio calculation",
9579            }
9580
9581            # Param
9582            param = self.get_param()
9583
9584            # Prefix
9585            prefix = self.get_explode_infos_prefix()
9586
9587            # Trio param
9588            trio_ped = (
9589                param.get("calculation", {})
9590                .get("calculations", {})
9591                .get("TRIO", {})
9592                .get("trio_pedigree", None)
9593            )
9594
9595            # Load trio
9596            if trio_ped:
9597
9598                # Trio pedigree is a file
9599                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
9600                    log.debug("TRIO pedigree is file")
9601                    with open(full_path(trio_ped)) as trio_ped:
9602                        trio_ped = json.load(trio_ped)
9603
9604                # Trio pedigree is a string
9605                elif isinstance(trio_ped, str):
9606                    log.debug("TRIO pedigree is str")
9607                    try:
9608                        trio_ped = json.loads(trio_ped)
9609                        log.debug("TRIO pedigree is json str")
9610                    except ValueError as e:
9611                        trio_samples = trio_ped.split(",")
9612                        if len(trio_samples) == 3:
9613                            trio_ped = {
9614                                "father": trio_samples[0],
9615                                "mother": trio_samples[1],
9616                                "child": trio_samples[2],
9617                            }
9618                            log.debug("TRIO pedigree is list str")
9619                        else:
9620                            msg_error = "TRIO pedigree not well formatted"
9621                            log.error(msg_error)
9622                            raise ValueError(msg_error)
9623
9624                # Trio pedigree is a dict
9625                elif isinstance(trio_ped, dict):
9626                    log.debug("TRIO pedigree is dict")
9627
9628                # Trio pedigree is not well formatted
9629                else:
9630                    msg_error = "TRIO pedigree not well formatted"
9631                    log.error(msg_error)
9632                    raise ValueError(msg_error)
9633
9634                # Construct trio list
9635                trio_samples = [
9636                    trio_ped.get("father", ""),
9637                    trio_ped.get("mother", ""),
9638                    trio_ped.get("child", ""),
9639                ]
9640
9641            else:
9642                log.debug("TRIO pedigree not defined. Take the first 3 samples")
9643                samples_list = self.get_header_sample_list()
9644                if len(samples_list) >= 3:
9645                    trio_samples = self.get_header_sample_list()[0:3]
9646                    trio_ped = {
9647                        "father": trio_samples[0],
9648                        "mother": trio_samples[1],
9649                        "child": trio_samples[2],
9650                    }
9651                else:
9652                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
9653                    log.error(msg_error)
9654                    raise ValueError(msg_error)
9655
9656            # Check trio pedigree
9657            if not trio_ped or len(trio_ped) != 3:
9658                msg_error = f"Error in TRIO pedigree: {trio_ped}"
9659                log.error(msg_error)
9660                raise ValueError(msg_error)
9661
9662            # Log
9663            log.info(
9664                f"Calculation 'TRIO' - Samples: "
9665                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
9666            )
9667
9668            # Field
9669            trio_infos = prefix + trio_tag
9670
9671            # Variants table
9672            table_variants = self.get_table_variants()
9673
9674            # Header
9675            vcf_reader = self.get_header()
9676
9677            # Create variant id
9678            variant_id_column = self.get_variant_id_column()
9679            added_columns = [variant_id_column]
9680
9681            # variant_id, FORMAT and samples
9682            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9683                self.get_header_sample_list()
9684            )
9685
9686            # Create dataframe
9687            dataframe_trio = self.get_query_to_df(
9688                f""" SELECT {samples_fields} FROM {table_variants} """
9689            )
9690
9691            # Create trio column
9692            dataframe_trio[trio_infos] = dataframe_trio.apply(
9693                lambda row: trio(row, samples=trio_samples), axis=1
9694            )
9695
9696            # Add trio to header
9697            vcf_reader.infos[trio_tag] = vcf.parser._Info(
9698                trio_tag,
9699                ".",
9700                "String",
9701                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
9702                "howard calculation",
9703                "0",
9704                self.code_type_map.get("String"),
9705            )
9706
9707            # Update
9708            sql_update = f"""
9709                UPDATE {table_variants}
9710                SET "INFO" = 
9711                    concat(
9712                        CASE
9713                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9714                            THEN ''
9715                            ELSE concat("INFO", ';')
9716                        END,
9717                        CASE
9718                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
9719                             AND dataframe_trio."{trio_infos}" NOT NULL
9720                            THEN concat(
9721                                    '{trio_tag}=',
9722                                    dataframe_trio."{trio_infos}"
9723                                )
9724                            ELSE ''
9725                        END
9726                    )
9727                FROM dataframe_trio
9728                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
9729            """
9730            self.conn.execute(sql_update)
9731
9732            # Remove added columns
9733            for added_column in added_columns:
9734                self.drop_column(column=added_column)
9735
9736            # Delete dataframe
9737            del dataframe_trio
9738            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
9740    def calculation_vaf_normalization(self) -> None:
9741        """
9742        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
9743        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
9744        :return: The function does not return anything.
9745        """
9746
9747        # if FORMAT and samples
9748        if (
9749            "FORMAT" in self.get_header_columns_as_list()
9750            and self.get_header_sample_list()
9751        ):
9752
9753            # vaf_normalization annotation field
9754            vaf_normalization_tag = "VAF"
9755
9756            # VCF infos tags
9757            vcf_infos_tags = {
9758                "VAF": "VAF Variant Frequency",
9759            }
9760
9761            # Prefix
9762            prefix = self.get_explode_infos_prefix()
9763
9764            # Variants table
9765            table_variants = self.get_table_variants()
9766
9767            # Header
9768            vcf_reader = self.get_header()
9769
9770            # Do not calculate if VAF already exists
9771            if "VAF" in vcf_reader.formats:
9772                log.debug("VAF already on genotypes")
9773                return
9774
9775            # Create variant id
9776            variant_id_column = self.get_variant_id_column()
9777            added_columns = [variant_id_column]
9778
9779            # variant_id, FORMAT and samples
9780            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9781                f""" "{sample}" """ for sample in self.get_header_sample_list()
9782            )
9783
9784            # Create dataframe
9785            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9786            log.debug(f"query={query}")
9787            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9788
9789            vaf_normalization_set = []
9790
9791            # for each sample vaf_normalization
9792            for sample in self.get_header_sample_list():
9793                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9794                    lambda row: vaf_normalization(row, sample=sample), axis=1
9795                )
9796                vaf_normalization_set.append(
9797                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9798                )
9799
9800            # Add VAF to FORMAT
9801            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9802                "FORMAT"
9803            ].apply(lambda x: str(x) + ":VAF")
9804            vaf_normalization_set.append(
9805                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9806            )
9807
9808            # Add vaf_normalization to header
9809            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9810                id=vaf_normalization_tag,
9811                num="1",
9812                type="Float",
9813                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9814                type_code=self.code_type_map.get("Float"),
9815            )
9816
9817            # Create fields to add in INFO
9818            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9819
9820            # Update
9821            sql_update = f"""
9822                UPDATE {table_variants}
9823                SET {sql_vaf_normalization_set}
9824                FROM dataframe_vaf_normalization
9825                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9826
9827            """
9828            self.conn.execute(sql_update)
9829
9830            # Remove added columns
9831            for added_column in added_columns:
9832                self.drop_column(column=added_column)
9833
9834            # Delete dataframe
9835            del dataframe_vaf_normalization
9836            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
9838    def calculation_genotype_stats(self, info: str = "VAF") -> None:
9839        """
9840        The `calculation_genotype_stats` function calculates genotype statistics for a given information
9841        field in a VCF file and updates the INFO column of the variants table with the calculated
9842        statistics.
9843
9844        :param info: The `info` parameter is a string that represents the type of information for which
9845        genotype statistics are calculated. It is used to generate various VCF info tags for the
9846        statistics, such as the number of occurrences, the list of values, the minimum value, the
9847        maximum value, the mean, the median, defaults to VAF
9848        :type info: str (optional)
9849        """
9850
9851        # if FORMAT and samples
9852        if (
9853            "FORMAT" in self.get_header_columns_as_list()
9854            and self.get_header_sample_list()
9855        ):
9856
9857            # vaf_stats annotation field
9858            vaf_stats_tag = info + "_stats"
9859
9860            # VCF infos tags
9861            vcf_infos_tags = {
9862                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
9863                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
9864                info + "_stats_min": f"genotype {info} Statistics - min {info}",
9865                info + "_stats_max": f"genotype {info} Statistics - max {info}",
9866                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
9867                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
9868                info
9869                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
9870            }
9871
9872            # Prefix
9873            prefix = self.get_explode_infos_prefix()
9874
9875            # Field
9876            vaf_stats_infos = prefix + vaf_stats_tag
9877
9878            # Variants table
9879            table_variants = self.get_table_variants()
9880
9881            # Header
9882            vcf_reader = self.get_header()
9883
9884            # Create variant id
9885            variant_id_column = self.get_variant_id_column()
9886            added_columns = [variant_id_column]
9887
9888            # variant_id, FORMAT and samples
9889            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9890                self.get_header_sample_list()
9891            )
9892
9893            # Create dataframe
9894            dataframe_vaf_stats = self.get_query_to_df(
9895                f""" SELECT {samples_fields} FROM {table_variants} """
9896            )
9897
9898            # Create vaf_stats column
9899            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
9900                lambda row: genotype_stats(
9901                    row, samples=self.get_header_sample_list(), info=info
9902                ),
9903                axis=1,
9904            )
9905
9906            # List of vcf tags
9907            sql_vaf_stats_fields = []
9908
9909            # Check all VAF stats infos
9910            for stat in vcf_infos_tags:
9911
9912                # Extract stats
9913                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
9914                    lambda x: dict(x).get(stat, "")
9915                )
9916
9917                # Add snpeff_hgvs to header
9918                vcf_reader.infos[stat] = vcf.parser._Info(
9919                    stat,
9920                    ".",
9921                    "String",
9922                    vcf_infos_tags.get(stat, "genotype statistics"),
9923                    "howard calculation",
9924                    "0",
9925                    self.code_type_map.get("String"),
9926                )
9927
9928                if len(sql_vaf_stats_fields):
9929                    sep = ";"
9930                else:
9931                    sep = ""
9932
9933                # Create fields to add in INFO
9934                sql_vaf_stats_fields.append(
9935                    f"""
9936                        CASE
9937                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
9938                            THEN concat(
9939                                    '{sep}{stat}=',
9940                                    dataframe_vaf_stats."{stat}"
9941                                )
9942                            ELSE ''
9943                        END
9944                    """
9945                )
9946
9947            # SQL set for update
9948            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
9949
9950            # Update
9951            sql_update = f"""
9952                UPDATE {table_variants}
9953                SET "INFO" = 
9954                    concat(
9955                        CASE
9956                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9957                            THEN ''
9958                            ELSE concat("INFO", ';')
9959                        END,
9960                        {sql_vaf_stats_fields_set}
9961                    )
9962                FROM dataframe_vaf_stats
9963                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
9964
9965            """
9966            self.conn.execute(sql_update)
9967
9968            # Remove added columns
9969            for added_column in added_columns:
9970                self.drop_column(column=added_column)
9971
9972            # Delete dataframe
9973            del dataframe_vaf_stats
9974            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: The info parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, the median, defaults to VAF
def calculation_transcripts_annotation(self, info_json: str = None, info_format: str = None) -> None:
 9976    def calculation_transcripts_annotation(
 9977        self, info_json: str = None, info_format: str = None
 9978    ) -> None:
 9979        """
 9980        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
 9981        field to it if transcripts are available.
 9982
 9983        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
 9984        is a string parameter that represents the information field to be used in the transcripts JSON.
 9985        It is used to specify the JSON format for the transcripts information. If no value is provided
 9986        when calling the method, it defaults to "
 9987        :type info_json: str
 9988        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
 9989        method is a string parameter that specifies the format of the information field to be used in
 9990        the transcripts JSON. It is used to define the format of the information field
 9991        :type info_format: str
 9992        """
 9993
 9994        # Create transcripts table
 9995        transcripts_table = self.create_transcript_view()
 9996
 9997        # Add info field
 9998        if transcripts_table:
 9999            self.transcript_view_to_variants(
10000                transcripts_table=transcripts_table,
10001                transcripts_info_field_json=info_json,
10002                transcripts_info_field_format=info_format,
10003            )
10004        else:
10005            log.info("No Transcripts to process. Check param.json file configuration")

The calculation_transcripts_annotation function creates a transcripts table and adds an info field to it if transcripts are available.

Parameters
  • info_json: The info_json parameter in the calculation_transcripts_annotation method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information. If no value is provided when calling the method, it defaults to None.
  • info_format: The info_format parameter in the calculation_transcripts_annotation method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field
def calculation_transcripts_prioritization(self) -> None:
10007    def calculation_transcripts_prioritization(self) -> None:
10008        """
10009        The function `calculation_transcripts_prioritization` creates a transcripts table and
10010        prioritizes transcripts based on certain criteria.
10011        """
10012
10013        # Create transcripts table
10014        transcripts_table = self.create_transcript_view()
10015
10016        # Add info field
10017        if transcripts_table:
10018            self.transcripts_prioritization(transcripts_table=transcripts_table)
10019        else:
10020            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_prioritization creates a transcripts table and prioritizes transcripts based on certain criteria.

def calculation_transcripts_export(self) -> None:
10022    def calculation_transcripts_export(self) -> None:
10023        """ """
10024
10025        # Create transcripts table
10026        transcripts_table = self.create_transcript_view()
10027
10028        # Add info field
10029        if transcripts_table:
10030            self.transcripts_export(transcripts_table=transcripts_table)
10031        else:
10032            log.info("No Transcripts to process. Check param.json file configuration")
def transcripts_export(self, transcripts_table: str = None, param: dict = {}) -> bool:
10038    def transcripts_export(
10039        self, transcripts_table: str = None, param: dict = {}
10040    ) -> bool:
10041        """ """
10042
10043        log.debug("Start transcripts export...")
10044
10045        # Param
10046        if not param:
10047            param = self.get_param()
10048
10049        # Param export
10050        param_transcript_export = param.get("transcripts", {}).get("export", {})
10051
10052        # Output file
10053        transcripts_export_output = param_transcript_export.get("output", None)
10054
10055        if not param_transcript_export or not transcripts_export_output:
10056            log.warning(f"No transcriipts export parameters defined!")
10057            return False
10058
10059        # List of transcripts annotations
10060        query_describe = f"""
10061            SELECT column_name
10062            FROM (
10063                    DESCRIBE SELECT * FROM {transcripts_table}
10064                )
10065            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10066        """
10067        transcripts_annotations_list = list(
10068            self.get_query_to_df(query=query_describe)["column_name"]
10069        )
10070
10071        # Create transcripts table for export
10072        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10073            random.choices(string.ascii_uppercase + string.digits, k=10)
10074        )
10075        query_create_transcripts_table_export = f"""
10076            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10077        """
10078        self.execute_query(query=query_create_transcripts_table_export)
10079
10080        # Output file format
10081        transcripts_export_output_format = get_file_format(
10082            filename=transcripts_export_output
10083        )
10084
10085        # Format VCF - construct INFO
10086        if transcripts_export_output_format in ["vcf"]:
10087
10088            # Construct query update INFO and header
10089            query_update_info = []
10090            for field in transcripts_annotations_list:
10091
10092                # If field not in header
10093                if field not in self.get_header_infos_list():
10094
10095                    # Add PZ Transcript in header
10096                    self.get_header().infos[field] = vcf.parser._Info(
10097                        field,
10098                        ".",
10099                        "String",
10100                        f"Annotation '{field}' from transcript view",
10101                        "unknown",
10102                        "unknown",
10103                        0,
10104                    )
10105
10106                # Add field as INFO/tag
10107                query_update_info.append(
10108                    f"""
10109                        CASE
10110                            WHEN "{field}" IS NOT NULL
10111                            THEN concat('{field}=', "{field}", ';')    
10112                            ELSE ''     
10113                        END
10114                        """
10115                )
10116
10117            # Query param
10118            query_update_info_value = (
10119                f""" concat('',  {", ".join(query_update_info)}) """
10120            )
10121            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10122
10123        else:
10124
10125            # Query param
10126            query_update_info_value = f""" NULL """
10127            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10128
10129        # Update query INFO column
10130        query_update = f"""
10131            UPDATE {transcripts_table_export}
10132            SET INFO = {query_update_info_value}
10133
10134        """
10135        self.execute_query(query=query_update)
10136
10137        # Export
10138        self.export_output(
10139            output_file=transcripts_export_output,
10140            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10141        )
10142
10143        # Drop transcripts export table
10144        query_drop_transcripts_table_export = f"""
10145            DROP TABLE {transcripts_table_export}
10146        """
10147        self.execute_query(query=query_drop_transcripts_table_export)
def transcripts_prioritization(self, transcripts_table: str = None, param: dict = {}) -> bool:
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
        This parameter is used to identify the table where the transcripts data is stored for the
        prioritization process
        :type transcripts_table: str
        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
        that contains various configuration settings for the prioritization process of transcripts. It
        is used to customize the behavior of the prioritization algorithm and includes settings such as
        the prefix for prioritization fields, default profiles, and other
        :type param: dict
        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
        transcripts prioritization process is successfully completed, and `False` if there are any
        issues or if no profile is defined for transcripts prioritization.
        """

        log.debug("Start transcripts prioritization...")

        # Param: fall back to the object's parameters when not provided
        if not param:
            param = self.get_param()

        # Variants table (target of the final INFO update)
        table_variants = self.get_table_variants()

        # Transcripts table: build the view when not provided, fail when unavailable
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
        if transcripts_table is None:
            msg_err = "No Transcripts table availalble"
            log.error(msg_err)
            raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO column if not exists (prioritization writes into it)
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param and Force only PZ Score and Flag
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # PZ fields: maps source field name -> prefixed INFO tag name
        pz_param_pzfields = {}

        # PZ field holding the selected transcript (e.g. "PTZTranscript")
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Add PZ Transcript in header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory fields always produced by the prioritization process
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # PZ fields in param: mandatory fields map prefixed->prefixed,
        # extra fields map original name -> prefixed name and get a header entry
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Add PZ Transcript in header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # PZ fields param: restrict prioritization output to the mandatory fields
        # NOTE: this mutates the dict inside `param` as well (same reference)
        pz_param["pzfields"] = pz_mandatory_fields

        # Run prioritization on the transcripts table
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # PZ fields sql query fragments (each select entry carries its own trailing comma)
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                    , CASE 
                        WHEN {pz_param_pzfield} IS NOT NULL
                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                        ELSE ''
                    END
                """
            )

        # Order by clauses; default: best Flag first (ASC), then highest Score
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "ASC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode from INFO into columns
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove transcript column as a specific transcript column
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check fields to explode: must be known in header or already a column
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields to explode
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file (optional ranked list of preferred transcripts)
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Transcript preference forced (preference order wins over score/flag order)
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Transcript version forced (match transcript IDs including version suffix)
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts Ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Order by depending to transcript preference forcing
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Transcript columns joined depend on version consideration
            # (split_part strips the ".N" version suffix when not forced)
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Query ranking for update
            # NOTE(review): `transcripts_preference_dataframe` is referenced by name in
            # the SQL — presumably resolved by duckdb's replacement scan of the local
            # pandas dataframe; confirm the connection is duckdb when modifying
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN 
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
            """

        else:

            # Query ranking for update (no preference file: rank by order list only)
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table:
        # append the selected (rank 1) transcript and its PZ fields to INFO
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
                SET
                INFO = CONCAT(CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                        )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"     
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True

The transcripts_prioritization function prioritizes transcripts based on certain parameters and updates the variants table with the prioritized information.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process
  • param: The param parameter in the transcripts_prioritization method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other
Returns

The function transcripts_prioritization returns a boolean value True if the transcripts prioritization process is successfully completed, and False if there are any issues or if no profile is defined for transcripts prioritization.

def create_transcript_view_from_columns_map( self, transcripts_table: str = 'transcripts', columns_maps: dict = {}, added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list, list]:
10460    def create_transcript_view_from_columns_map(
10461        self,
10462        transcripts_table: str = "transcripts",
10463        columns_maps: dict = {},
10464        added_columns: list = [],
10465        temporary_tables: list = None,
10466        annotation_fields: list = None,
10467        column_rename: dict = {},
10468        column_clean: bool = False,
10469        column_case: str = None,
10470    ) -> tuple[list, list, list]:
10471        """
10472        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10473        specified columns mapping for transcripts data.
10474
10475        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10476        of the table where the transcripts data is stored or will be stored in the database. This table
10477        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10478        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10479        :type transcripts_table: str (optional)
10480        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10481        about how to map columns from a transcripts table to create a view. Each entry in the
10482        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10483        typically includes details such as the main transcript column and additional information columns
10484        :type columns_maps: dict
10485        :param added_columns: The `added_columns` parameter in the
10486        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10487        that will be added to the view being created based on the columns map provided. These columns
10488        are generated by exploding the transcript information columns along with the main transcript
10489        column
10490        :type added_columns: list
10491        :param temporary_tables: The `temporary_tables` parameter in the
10492        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10493        tables created during the process of creating a transcript view from a columns map. These
10494        temporary tables are used to store intermediate results or transformations before the final view
10495        is generated
10496        :type temporary_tables: list
10497        :param annotation_fields: The `annotation_fields` parameter in the
10498        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10499        used for annotation in the query view creation process. These fields are extracted from the
10500        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10501        :type annotation_fields: list
10502        :param column_rename: The `column_rename` parameter in the
10503        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10504        custom renaming for columns during the creation of the temporary table view. This parameter
10505        provides a mapping of original column names to the desired renamed column names. By using this
10506        parameter,
10507        :type column_rename: dict
10508        :param column_clean: The `column_clean` parameter in the
10509        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10510        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10511        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10512        False
10513        :type column_clean: bool (optional)
10514        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10515        function is used to specify the case transformation to be applied to the columns during the view
10516        creation process. It allows you to control whether the column values should be converted to
10517        lowercase, uppercase, or remain unchanged
10518        :type column_case: str
10519        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10520        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10521        """
10522
10523        log.debug("Start transcrpts view creation from columns map...")
10524
10525        # "from_columns_map": [
10526        #     {
10527        #         "transcripts_column": "Ensembl_transcriptid",
10528        #         "transcripts_infos_columns": [
10529        #             "genename",
10530        #             "Ensembl_geneid",
10531        #             "LIST_S2_score",
10532        #             "LIST_S2_pred",
10533        #         ],
10534        #     },
10535        #     {
10536        #         "transcripts_column": "Ensembl_transcriptid",
10537        #         "transcripts_infos_columns": [
10538        #             "genename",
10539        #             "VARITY_R_score",
10540        #             "Aloft_pred",
10541        #         ],
10542        #     },
10543        # ],
10544
10545        # Init
10546        if temporary_tables is None:
10547            temporary_tables = []
10548        if annotation_fields is None:
10549            annotation_fields = []
10550
10551        # Variants table
10552        table_variants = self.get_table_variants()
10553
10554        for columns_map in columns_maps:
10555
10556            # Transcript column
10557            transcripts_column = columns_map.get("transcripts_column", None)
10558
10559            # Transcripts infos columns
10560            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10561
10562            # Transcripts infos columns rename
10563            column_rename = columns_map.get("column_rename", column_rename)
10564
10565            # Transcripts infos columns clean
10566            column_clean = columns_map.get("column_clean", column_clean)
10567
10568            # Transcripts infos columns case
10569            column_case = columns_map.get("column_case", column_case)
10570
10571            if transcripts_column is not None:
10572
10573                # Explode
10574                added_columns += self.explode_infos(
10575                    fields=[transcripts_column] + transcripts_infos_columns
10576                )
10577
10578                # View clauses
10579                clause_select_variants = []
10580                clause_select_tanscripts = []
10581                for field in [transcripts_column] + transcripts_infos_columns:
10582
10583                    # AS field
10584                    as_field = field
10585
10586                    # Rename
10587                    if column_rename:
10588                        as_field = column_rename.get(as_field, as_field)
10589
10590                    # Clean
10591                    if column_clean:
10592                        as_field = clean_annotation_field(as_field)
10593
10594                    # Case
10595                    if column_case:
10596                        if column_case.lower() in ["lower"]:
10597                            as_field = as_field.lower()
10598                        elif column_case.lower() in ["upper"]:
10599                            as_field = as_field.upper()
10600
10601                    # Clause select Variants
10602                    clause_select_variants.append(
10603                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10604                    )
10605
10606                    if field in [transcripts_column]:
10607                        clause_select_tanscripts.append(
10608                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10609                        )
10610                    else:
10611                        clause_select_tanscripts.append(
10612                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10613                        )
10614                        annotation_fields.append(as_field)
10615
10616                # Querey View
10617                query = f""" 
10618                    SELECT
10619                        "#CHROM", POS, REF, ALT, INFO,
10620                        "{transcripts_column}" AS 'transcript',
10621                        {", ".join(clause_select_tanscripts)}
10622                    FROM (
10623                        SELECT 
10624                            "#CHROM", POS, REF, ALT, INFO,
10625                            {", ".join(clause_select_variants)}
10626                        FROM {table_variants}
10627                        )
10628                    WHERE "{transcripts_column}" IS NOT NULL
10629                """
10630
10631                # Create temporary table
10632                temporary_table = transcripts_table + "".join(
10633                    random.choices(string.ascii_uppercase + string.digits, k=10)
10634                )
10635
10636                # Temporary_tables
10637                temporary_tables.append(temporary_table)
10638                query_view = f"""
10639                    CREATE TEMPORARY TABLE {temporary_table}
10640                    AS ({query})
10641                """
10642                self.execute_query(query=query_view)
10643
10644        return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_columns_map function generates a temporary table view based on specified columns mapping for transcripts data.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts".
  • columns_maps: The columns_maps parameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in the columns_maps list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns
  • added_columns: The added_columns parameter in the create_transcript_view_from_columns_map function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_columns_map function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_columns_map function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the transcripts_column and transcripts_infos_columns specified in the columns_maps configuration.
  • column_rename: The column_rename parameter in the create_transcript_view_from_columns_map function is a dictionary that allows you to specify custom renaming for columns during the creation of the temporary table view. This parameter provides a mapping of original column names to the desired renamed column names. By using this parameter, you can control how annotation columns are named in the resulting view.
  • column_clean: The column_clean parameter in the create_transcript_view_from_columns_map function is a boolean flag that determines whether the column values should be cleaned or not. If set to True, the column values will be cleaned by removing any non-alphanumeric characters from them. Defaults to False.
  • column_case: The column_case parameter in the create_transcript_view_from_columns_map function is used to specify the case transformation to be applied to the columns during the view creation process. It allows you to control whether the column values should be converted to lowercase, uppercase, or remain unchanged
Returns

The create_transcript_view_from_columns_map function returns a tuple containing three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view_from_column_format( self, transcripts_table: str = 'transcripts', column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list]:
10646    def create_transcript_view_from_column_format(
10647        self,
10648        transcripts_table: str = "transcripts",
10649        column_formats: dict = {},
10650        temporary_tables: list = None,
10651        annotation_fields: list = None,
10652        column_rename: dict = {},
10653        column_clean: bool = False,
10654        column_case: str = None,
10655    ) -> tuple[list, list, list]:
10656        """
10657        The `create_transcript_view_from_column_format` function generates a transcript view based on
10658        specified column formats, adds additional columns and annotation fields, and returns the list of
10659        temporary tables and annotation fields.
10660
10661        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10662        of the table containing the transcripts data. This table will be used as the base table for
10663        creating the transcript view. The default value for this parameter is "transcripts", but you can
10664        provide a different table name if needed, defaults to transcripts
10665        :type transcripts_table: str (optional)
10666        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10667        about the columns to be used for creating the transcript view. Each entry in the dictionary
10668        specifies the mapping between a transcripts column and a transcripts infos column. This
10669        parameter allows you to define how the columns from the transcripts table should be transformed
10670        or mapped
10671        :type column_formats: dict
10672        :param temporary_tables: The `temporary_tables` parameter in the
10673        `create_transcript_view_from_column_format` function is a list that stores the names of
10674        temporary views created during the process of creating a transcript view from a column format.
10675        These temporary views are used to manipulate and extract data before generating the final
10676        transcript view
10677        :type temporary_tables: list
10678        :param annotation_fields: The `annotation_fields` parameter in the
10679        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10680        that are extracted from the temporary views created during the process. These annotation fields
10681        are obtained by querying the temporary views and extracting the column names excluding specific
10682        columns like `#CH
10683        :type annotation_fields: list
10684        :param column_rename: The `column_rename` parameter in the
10685        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10686        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10687        column names to new column names in this dictionary, you can rename specific columns during the
10688        process
10689        :type column_rename: dict
10690        :param column_clean: The `column_clean` parameter in the
10691        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10692        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10693        will be cleaned during the creation of the transcript view based on the specified column format,
10694        defaults to False
10695        :type column_clean: bool (optional)
10696        :param column_case: The `column_case` parameter in the
10697        `create_transcript_view_from_column_format` function is used to specify the case transformation
10698        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10699        to convert the column names to uppercase or lowercase, respectively
10700        :type column_case: str
10701        :return: The `create_transcript_view_from_column_format` function returns two lists:
10702        `temporary_tables` and `annotation_fields`.
10703        """
10704
10705        log.debug("Start transcrpts view creation from column format...")
10706
10707        #  "from_column_format": [
10708        #     {
10709        #         "transcripts_column": "ANN",
10710        #         "transcripts_infos_column": "Feature_ID",
10711        #     }
10712        # ],
10713
10714        # Init
10715        if temporary_tables is None:
10716            temporary_tables = []
10717        if annotation_fields is None:
10718            annotation_fields = []
10719
10720        for column_format in column_formats:
10721
10722            # annotation field and transcript annotation field
10723            annotation_field = column_format.get("transcripts_column", "ANN")
10724            transcript_annotation = column_format.get(
10725                "transcripts_infos_column", "Feature_ID"
10726            )
10727
10728            # Transcripts infos columns rename
10729            column_rename = column_format.get("column_rename", column_rename)
10730
10731            # Transcripts infos columns clean
10732            column_clean = column_format.get("column_clean", column_clean)
10733
10734            # Transcripts infos columns case
10735            column_case = column_format.get("column_case", column_case)
10736
10737            # Temporary View name
10738            temporary_view_name = transcripts_table + "".join(
10739                random.choices(string.ascii_uppercase + string.digits, k=10)
10740            )
10741
10742            # Create temporary view name
10743            temporary_view_name = self.annotation_format_to_table(
10744                uniquify=True,
10745                annotation_field=annotation_field,
10746                view_name=temporary_view_name,
10747                annotation_id=transcript_annotation,
10748                column_rename=column_rename,
10749                column_clean=column_clean,
10750                column_case=column_case,
10751            )
10752
10753            # Annotation fields
10754            if temporary_view_name:
10755                query_annotation_fields = f"""
10756                    SELECT *
10757                    FROM (
10758                        DESCRIBE SELECT *
10759                        FROM {temporary_view_name}
10760                        )
10761                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10762                """
10763                df_annotation_fields = self.get_query_to_df(
10764                    query=query_annotation_fields
10765                )
10766
10767                # Add temporary view and annotation fields
10768                temporary_tables.append(temporary_view_name)
10769                annotation_fields += list(set(df_annotation_fields["column_name"]))
10770
10771        return temporary_tables, annotation_fields

The create_transcript_view_from_column_format function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts
  • column_formats: The column_formats parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. This parameter allows you to define how the columns from the transcripts table should be transformed or mapped
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_column_format function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_column_format function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding specific columns such as #CHROM, POS, REF and ALT.
  • column_rename: The column_rename parameter in the create_transcript_view_from_column_format function is a dictionary that allows you to specify custom renaming of columns in the transcripts infos table. By providing a mapping of original column names to new column names in this dictionary, you can rename specific columns during the process
  • column_clean: The column_clean parameter in the create_transcript_view_from_column_format function is a boolean flag that determines whether the transcripts infos columns should undergo a cleaning process. If set to True, the columns will be cleaned during the creation of the transcript view based on the specified column format, defaults to False
  • column_case: The column_case parameter in the create_transcript_view_from_column_format function is used to specify the case transformation to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" to convert the column names to uppercase or lowercase, respectively
Returns

The create_transcript_view_from_column_format function returns two lists: temporary_tables and annotation_fields.

def create_transcript_view( self, transcripts_table: str = None, transcripts_table_drop: bool = True, param: dict = {}) -> str:
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = True,
        param: dict = {},
    ) -> str:
        """
        The `create_transcript_view` function generates a transcript view by processing data from a
        specified table based on provided parameters and structural information.

        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
        is used to specify the name of the table that will store the final transcript view data. If a table
        name is not provided, the function will create a new table to store the transcript view data,
        named "transcripts" by default
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
        `create_transcript_view` function is a boolean parameter that determines whether to drop the
        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
        the function will drop the existing transcripts table if it exists, defaults to True
        :type transcripts_table_drop: bool (optional)
        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
        contains information needed to create a transcript view. It includes details such as the structure
        of the transcripts, columns mapping, column formats, and other necessary information for generating
        the view. This parameter allows for flexibility and customization
        :type param: dict
        :return: The `create_transcript_view` function returns the name of the transcripts table that was
        created or modified during the execution of the function (None if no transcripts
        struct is configured).
        """

        log.debug("Start transcripts view creation...")

        # Default table name used when neither the argument nor the param dict provides one
        transcripts_table_default = "transcripts"

        # Param: fall back to the object's configured parameters
        if not param:
            param = self.get_param()

        # Struct: describes how transcript data is laid out in the variants table
        struct = param.get("transcripts", {}).get("struct", None)

        # Transcript version: whether to strip the ".N" version suffix from transcript IDs
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Transcripts mapping file (two columns: transcript and alias)
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # If set, only transcripts present in the mapping file are kept in the view
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added to the variants table by the helpers; dropped again at the end
            added_columns = []

            # Temporary tables created by the helpers, merged below
            temporary_tables = []

            # Annotation fields collected from the temporary tables
            annotation_fields = []

            # From columns map
            # NOTE(review): the helper appends to the lists passed in AND returns them,
            # so the "+=" below duplicates entries; duplicates are removed later via set()
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # From column format (e.g. snpEff "ANN" style annotation fields)
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Deduplicate and drop core VCF columns that are not annotation fields
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge temporary tables query
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # Other temporary table (UNION BY NAME aligns columns by name, not position)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases for the nested sub-queries built below
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # SELECT clauses aggregating values when grouping rows by transcript
            query_merge_on_transcripts_annotation_fields = []

            # Add transcript list (distinct transcripts joined with ',')
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotations fields the same way (distinct values, ',' joined)
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcripts mapping
            if transcript_id_mapping_file:

                # Transcript mapping dataframe
                # NOTE(review): the DataFrame variable looks unused but is referenced by
                # name in the SQL below — presumably resolved through DuckDB's replacement
                # scan of in-scope pandas DataFrames; confirm
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Transcript version removal: compare and emit IDs without the version suffix
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Group-by key for the final merge: mapped ID when present, else original
                query_transcript_merge_group_by = """
                        CASE
                            WHEN transcript_mapped NOT IN ('')
                            THEN split_part(transcript_mapped, '.', 1)
                            ELSE split_part(transcript_original, '.', 1)
                        END
                    """

                # Merge query: aggregate annotations per (variant, transcript) pair
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve columns after merge
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Create list of columns for select clause
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        # Expose the original IDs under a dedicated column name
                        if field in ["transcript_original"]:
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Merge with mapping: one row per mapped transcript, preferring mapped IDs
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                        {query_transcript_merge_group_by}
                """

                # Add transcript filter from mapping file (keep mapped transcripts only)
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping
            else:

                # Remove transcript version
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections
                # NOTE(review): these two variables are unused in this branch — the query
                # below inlines query_transcript_column directly
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Query for transcripts view (no mapping: transcript_mapped is NULL)
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop transcript table if necessary
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcript view
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove columns the helpers added to the variants table
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            # No transcripts struct configured: nothing to build
            transcripts_table = None

        return transcripts_table

The create_transcript_view function generates a transcript view by processing data from a specified table based on provided parameters and structural information.

Parameters
  • transcripts_table: The transcripts_table parameter in the create_transcript_view function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data, named "transcripts" by default.
  • transcripts_table_drop: The transcripts_table_drop parameter in the create_transcript_view function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If transcripts_table_drop is set to True, the function will drop the existing transcripts table if it exists, defaults to True
  • param: The param parameter in the create_transcript_view function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns

The create_transcript_view function returns the name of the transcripts table that was created or modified during the execution of the function.

def annotation_format_to_table( self, uniquify: bool = True, annotation_field: str = 'ANN', annotation_id: str = 'Feature_ID', view_name: str = 'transcripts', column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> str:
11057    def annotation_format_to_table(
11058        self,
11059        uniquify: bool = True,
11060        annotation_field: str = "ANN",
11061        annotation_id: str = "Feature_ID",
11062        view_name: str = "transcripts",
11063        column_rename: dict = {},
11064        column_clean: bool = False,
11065        column_case: str = None,
11066    ) -> str:
11067        """
11068        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11069        structured table format, ensuring unique values and creating a temporary table for further
11070        processing or analysis.
11071
11072        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11073        unique values in the output or not. If set to `True`, the function will make sure that the
11074        output values are unique, defaults to True
11075        :type uniquify: bool (optional)
11076        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11077        that contains the annotation information for each variant. This field is used to extract the
11078        annotation details for further processing in the function. By default, it is set to "ANN",
11079        defaults to ANN
11080        :type annotation_field: str (optional)
11081        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11082        is used to specify the identifier for the annotation feature. This identifier will be used as a
11083        column name in the resulting table or view that is created based on the annotation data. It
11084        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11085        :type annotation_id: str (optional)
11086        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11087        to specify the name of the temporary table that will be created to store the transformed
11088        annotation data. This table will hold the extracted information from the annotation field in a
11089        structured format for further processing or analysis. By default,, defaults to transcripts
11090        :type view_name: str (optional)
11091        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11092        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11093        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11094        created based on the annotation data. This feature enables
11095        :type column_rename: dict
11096        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11097        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11098        If set to `True`, the function will clean the annotation field before further processing. This
11099        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11100        to False
11101        :type column_clean: bool (optional)
11102        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11103        used to specify the case transformation to be applied to the column names extracted from the
11104        annotation data. It allows you to set the case of the column names to either lowercase or
11105        uppercase for consistency or other specific requirements during the conversion
11106        :type column_case: str
11107        :return: The function `annotation_format_to_table` is returning the name of the view created,
11108        which is stored in the variable `view_name`.
11109        """
11110
11111        # Annotation field
11112        annotation_format = "annotation_explode"
11113
11114        # Transcript annotation
11115        if column_rename:
11116            annotation_id = column_rename.get(annotation_id, annotation_id)
11117
11118        if column_clean:
11119            annotation_id = clean_annotation_field(annotation_id)
11120
11121        # Prefix
11122        prefix = self.get_explode_infos_prefix()
11123        if prefix:
11124            prefix = "INFO/"
11125
11126        # Annotation fields
11127        annotation_infos = prefix + annotation_field
11128        annotation_format_infos = prefix + annotation_format
11129
11130        # Variants table
11131        table_variants = self.get_table_variants()
11132
11133        # Header
11134        vcf_reader = self.get_header()
11135
11136        # Add columns
11137        added_columns = []
11138
11139        # Explode HGVS field in column
11140        added_columns += self.explode_infos(fields=[annotation_field])
11141
11142        if annotation_field in vcf_reader.infos:
11143
11144            # Extract ANN header
11145            ann_description = vcf_reader.infos[annotation_field].desc
11146            pattern = r"'(.+?)'"
11147            match = re.search(pattern, ann_description)
11148            if match:
11149                ann_header_match = match.group(1).split(" | ")
11150                ann_header = []
11151                ann_header_desc = {}
11152                for i in range(len(ann_header_match)):
11153                    ann_header_info = "".join(
11154                        char for char in ann_header_match[i] if char.isalnum()
11155                    )
11156                    ann_header.append(ann_header_info)
11157                    ann_header_desc[ann_header_info] = ann_header_match[i]
11158                if not ann_header_desc:
11159                    raise ValueError("Invalid header description format")
11160            else:
11161                raise ValueError("Invalid header description format")
11162
11163            # Create variant id
11164            variant_id_column = self.get_variant_id_column()
11165            added_columns += [variant_id_column]
11166
11167            # Create dataframe
11168            dataframe_annotation_format = self.get_query_to_df(
11169                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
11170            )
11171
11172            # Create annotation columns
11173            dataframe_annotation_format[
11174                annotation_format_infos
11175            ] = dataframe_annotation_format[annotation_infos].apply(
11176                lambda x: explode_annotation_format(
11177                    annotation=str(x),
11178                    uniquify=uniquify,
11179                    output_format="JSON",
11180                    prefix="",
11181                    header=list(ann_header_desc.values()),
11182                )
11183            )
11184
11185            # Find keys
11186            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
11187            df_keys = self.get_query_to_df(query=query_json)
11188
11189            # Check keys
11190            query_json_key = []
11191            for _, row in df_keys.iterrows():
11192
11193                # Key
11194                key = row.iloc[0]
11195                key_clean = key
11196
11197                # key rename
11198                if column_rename:
11199                    key_clean = column_rename.get(key_clean, key_clean)
11200
11201                # key clean
11202                if column_clean:
11203                    key_clean = clean_annotation_field(key_clean)
11204
11205                # Key case
11206                if column_case:
11207                    if column_case.lower() in ["lower"]:
11208                        key_clean = key_clean.lower()
11209                    elif column_case.lower() in ["upper"]:
11210                        key_clean = key_clean.upper()
11211
11212                # Type
11213                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
11214
11215                # Get DataFrame from query
11216                df_json_type = self.get_query_to_df(query=query_json_type)
11217
11218                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
11219                with pd.option_context("future.no_silent_downcasting", True):
11220                    df_json_type.fillna(value="", inplace=True)
11221                    replace_dict = {None: np.nan, "": np.nan}
11222                    df_json_type.replace(replace_dict, inplace=True)
11223                    df_json_type.dropna(inplace=True)
11224
11225                # Detect column type
11226                column_type = detect_column_type(df_json_type[key_clean])
11227
11228                # Append
11229                query_json_key.append(
11230                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
11231                )
11232
11233            # Create view
11234            query_view = f"""
11235                CREATE TEMPORARY TABLE {view_name}
11236                AS (
11237                    SELECT *, {annotation_id} AS 'transcript'
11238                    FROM (
11239                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11240                        FROM dataframe_annotation_format
11241                        )
11242                    );
11243            """
11244            self.execute_query(query=query_view)
11245
11246        else:
11247
11248            # Return None
11249            view_name = None
11250
11251        # Remove added columns
11252        for added_column in added_columns:
11253            self.drop_column(column=added_column)
11254
11255        return view_name

The annotation_format_to_table function converts annotation data from a VCF file into a structured table format, ensuring unique values and creating a temporary table for further processing or analysis.

Parameters
  • uniquify: The uniquify parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to True, the function will make sure that the output values are unique, defaults to True
  • annotation_field: The annotation_field parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function. By default, it is set to "ANN", defaults to ANN
  • annotation_id: The annotation_id parameter in the annotation_format_to_table method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry in the resulting table. Defaults to Feature_ID
  • view_name: The view_name parameter in the annotation_format_to_table method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis. Defaults to transcripts
  • column_rename: The column_rename parameter in the annotation_format_to_table method is a dictionary that allows you to specify custom renaming for columns. By providing key-value pairs in this dictionary, you can rename specific columns in the resulting table or view that is created based on the annotation data. This feature enables
  • column_clean: The column_clean parameter in the annotation_format_to_table method is a boolean flag that determines whether the annotation field should undergo a cleaning process. If set to True, the function will clean the annotation field before further processing. This cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults to False
  • column_case: The column_case parameter in the annotation_format_to_table method is used to specify the case transformation to be applied to the column names extracted from the annotation data. It allows you to set the case of the column names to either lowercase or uppercase for consistency or other specific requirements during the conversion
Returns

The function annotation_format_to_table is returning the name of the view created, which is stored in the variable view_name.

def transcript_view_to_variants( self, transcripts_table: str = None, transcripts_column_id: str = None, transcripts_info_json: str = None, transcripts_info_field_json: str = None, transcripts_info_format: str = None, transcripts_info_field_format: str = None, param: dict = {}) -> bool:
11257    def transcript_view_to_variants(
11258        self,
11259        transcripts_table: str = None,
11260        transcripts_column_id: str = None,
11261        transcripts_info_json: str = None,
11262        transcripts_info_field_json: str = None,
11263        transcripts_info_format: str = None,
11264        transcripts_info_field_format: str = None,
11265        param: dict = {},
11266    ) -> bool:
11267        """
11268        The `transcript_view_to_variants` function updates a variants table with information from
11269        transcripts in JSON format.
11270
11271        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11272        table containing the transcripts data. If this parameter is not provided, the function will
11273        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11274        :type transcripts_table: str
11275        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11276        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11277        identifier is used to match transcripts with variants in the database
11278        :type transcripts_column_id: str
11279        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11280        of the column in the variants table where the transcripts information will be stored in JSON
11281        format. This parameter allows you to define the column in the variants table that will hold the
11282        JSON-formatted information about transcripts
11283        :type transcripts_info_json: str
11284        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11285        specify the field in the VCF header that will contain information about transcripts in JSON
11286        format. This field will be added to the VCF header as an INFO field with the specified name
11287        :type transcripts_info_field_json: str
11288        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11289        format of the information about transcripts that will be stored in the variants table. This
11290        format can be used to define how the transcript information will be structured or displayed
11291        within the variants table
11292        :type transcripts_info_format: str
11293        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11294        specify the field in the VCF header that will contain information about transcripts in a
11295        specific format. This field will be added to the VCF header as an INFO field with the specified
11296        name
11297        :type transcripts_info_field_format: str
11298        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11299        that contains various configuration settings related to transcripts. It is used to provide
11300        default values for certain parameters if they are not explicitly provided when calling the
11301        method. The `param` dictionary can be passed as an argument
11302        :type param: dict
11303        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11304        if the operation is successful and `False` if certain conditions are not met.
11305        """
11306
11307        msg_info_prefix = "Start transcripts view to variants annotations"
11308
11309        log.debug(f"{msg_info_prefix}...")
11310
11311        # Default
11312        transcripts_table_default = "transcripts"
11313        transcripts_column_id_default = "transcript"
11314        transcripts_info_json_default = None
11315        transcripts_info_format_default = None
11316        transcripts_info_field_json_default = None
11317        transcripts_info_field_format_default = None
11318
11319        # Param
11320        if not param:
11321            param = self.get_param()
11322
11323        # Transcripts table
11324        if transcripts_table is None:
11325            transcripts_table = param.get("transcripts", {}).get(
11326                "table", transcripts_table_default
11327            )
11328
11329        # Transcripts column ID
11330        if transcripts_column_id is None:
11331            transcripts_column_id = param.get("transcripts", {}).get(
11332                "column_id", transcripts_column_id_default
11333            )
11334
11335        # Transcripts info json
11336        if transcripts_info_json is None:
11337            transcripts_info_json = param.get("transcripts", {}).get(
11338                "transcripts_info_json", transcripts_info_json_default
11339            )
11340
11341        # Transcripts info field JSON
11342        if transcripts_info_field_json is None:
11343            transcripts_info_field_json = param.get("transcripts", {}).get(
11344                "transcripts_info_field_json", transcripts_info_field_json_default
11345            )
11346        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11347        #     transcripts_info_json = transcripts_info_field_json
11348
11349        # Transcripts info format
11350        if transcripts_info_format is None:
11351            transcripts_info_format = param.get("transcripts", {}).get(
11352                "transcripts_info_format", transcripts_info_format_default
11353            )
11354
11355        # Transcripts info field FORMAT
11356        if transcripts_info_field_format is None:
11357            transcripts_info_field_format = param.get("transcripts", {}).get(
11358                "transcripts_info_field_format", transcripts_info_field_format_default
11359            )
11360        # if (
11361        #     transcripts_info_field_format is not None
11362        #     and transcripts_info_format is None
11363        # ):
11364        #     transcripts_info_format = transcripts_info_field_format
11365
11366        # Variants table
11367        table_variants = self.get_table_variants()
11368
11369        # Check info columns param
11370        if (
11371            transcripts_info_json is None
11372            and transcripts_info_field_json is None
11373            and transcripts_info_format is None
11374            and transcripts_info_field_format is None
11375        ):
11376            return False
11377
11378        # Transcripts infos columns
11379        query_transcripts_infos_columns = f"""
11380            SELECT *
11381            FROM (
11382                DESCRIBE SELECT * FROM {transcripts_table}
11383                )
11384            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11385        """
11386        transcripts_infos_columns = list(
11387            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11388        )
11389
11390        # View results
11391        clause_select = []
11392        clause_to_json = []
11393        clause_to_format = []
11394        for field in transcripts_infos_columns:
11395            # Do not consider INFO field for export into fields
11396            if field not in ["INFO"]:
11397                clause_select.append(
11398                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11399                )
11400                clause_to_json.append(f""" '{field}': "{field}" """)
11401                clause_to_format.append(f""" "{field}" """)
11402
11403        # Update
11404        update_set_json = []
11405        update_set_format = []
11406
11407        # VCF header
11408        vcf_reader = self.get_header()
11409
11410        # Transcripts to info column in JSON
11411        if transcripts_info_json:
11412
11413            # Create column on variants table
11414            self.add_column(
11415                table_name=table_variants,
11416                column_name=transcripts_info_json,
11417                column_type="JSON",
11418                default_value=None,
11419                drop=False,
11420            )
11421
11422            # Add header
11423            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11424                transcripts_info_json,
11425                ".",
11426                "String",
11427                "Transcripts in JSON format",
11428                "unknwon",
11429                "unknwon",
11430                self.code_type_map["String"],
11431            )
11432
11433            # Add to update
11434            update_set_json.append(
11435                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11436            )
11437
11438        # Transcripts to info field in JSON
11439        if transcripts_info_field_json:
11440
11441            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11442
11443            # Add to update
11444            update_set_json.append(
11445                f""" 
11446                    INFO = concat(
11447                            CASE
11448                                WHEN INFO NOT IN ('', '.')
11449                                THEN INFO
11450                                ELSE ''
11451                            END,
11452                            CASE
11453                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11454                                THEN concat(
11455                                    ';{transcripts_info_field_json}=',
11456                                    t.{transcripts_info_json}
11457                                )
11458                                ELSE ''
11459                            END
11460                            )
11461                """
11462            )
11463
11464            # Add header
11465            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11466                transcripts_info_field_json,
11467                ".",
11468                "String",
11469                "Transcripts in JSON format",
11470                "unknwon",
11471                "unknwon",
11472                self.code_type_map["String"],
11473            )
11474
11475        if update_set_json:
11476
11477            # Update query
11478            query_update = f"""
11479                UPDATE {table_variants}
11480                    SET {", ".join(update_set_json)}
11481                FROM
11482                (
11483                    SELECT
11484                        "#CHROM", POS, REF, ALT,
11485                            concat(
11486                            '{{',
11487                            string_agg(
11488                                '"' || "{transcripts_column_id}" || '":' ||
11489                                to_json(json_output)
11490                            ),
11491                            '}}'
11492                            )::JSON AS {transcripts_info_json}
11493                    FROM
11494                        (
11495                        SELECT
11496                            "#CHROM", POS, REF, ALT,
11497                            "{transcripts_column_id}",
11498                            to_json(
11499                                {{{",".join(clause_to_json)}}}
11500                            )::JSON AS json_output
11501                        FROM
11502                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11503                        WHERE "{transcripts_column_id}" IS NOT NULL
11504                        )
11505                    GROUP BY "#CHROM", POS, REF, ALT
11506                ) AS t
11507                WHERE {table_variants}."#CHROM" = t."#CHROM"
11508                    AND {table_variants}."POS" = t."POS"
11509                    AND {table_variants}."REF" = t."REF"
11510                    AND {table_variants}."ALT" = t."ALT"
11511            """
11512
11513            self.execute_query(query=query_update)
11514
11515        # Transcripts to info column in FORMAT
11516        if transcripts_info_format:
11517
11518            # Create column on variants table
11519            self.add_column(
11520                table_name=table_variants,
11521                column_name=transcripts_info_format,
11522                column_type="VARCHAR",
11523                default_value=None,
11524                drop=False,
11525            )
11526
11527            # Add header
11528            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11529                transcripts_info_format,
11530                ".",
11531                "String",
11532                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11533                "unknwon",
11534                "unknwon",
11535                self.code_type_map["String"],
11536            )
11537
11538            # Add to update
11539            update_set_format.append(
11540                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11541            )
11542
11543        else:
11544
11545            # Set variable for internal queries
11546            transcripts_info_format = "transcripts_info_format"
11547
11548        # Transcripts to info field in JSON
11549        if transcripts_info_field_format:
11550
11551            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11552
11553            # Add to update
11554            update_set_format.append(
11555                f""" 
11556                    INFO = concat(
11557                            CASE
11558                                WHEN INFO NOT IN ('', '.')
11559                                THEN INFO
11560                                ELSE ''
11561                            END,
11562                            CASE
11563                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11564                                THEN concat(
11565                                    ';{transcripts_info_field_format}=',
11566                                    t.{transcripts_info_format}
11567                                )
11568                                ELSE ''
11569                            END
11570                            )
11571                """
11572            )
11573
11574            # Add header
11575            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11576                transcripts_info_field_format,
11577                ".",
11578                "String",
11579                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11580                "unknwon",
11581                "unknwon",
11582                self.code_type_map["String"],
11583            )
11584
11585        if update_set_format:
11586
11587            # Update query
11588            query_update = f"""
11589                UPDATE {table_variants}
11590                    SET {", ".join(update_set_format)}
11591                FROM
11592                (
11593                    SELECT
11594                        "#CHROM", POS, REF, ALT,
11595                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11596                    FROM 
11597                        (
11598                        SELECT
11599                            "#CHROM", POS, REF, ALT,
11600                            "{transcripts_column_id}",
11601                            concat(
11602                                "{transcripts_column_id}",
11603                                '|',
11604                                {", '|', ".join(clause_to_format)}
11605                            ) AS {transcripts_info_format}
11606                        FROM
11607                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11608                        )
11609                    GROUP BY "#CHROM", POS, REF, ALT
11610                ) AS t
11611                WHERE {table_variants}."#CHROM" = t."#CHROM"
11612                    AND {table_variants}."POS" = t."POS"
11613                    AND {table_variants}."REF" = t."REF"
11614                    AND {table_variants}."ALT" = t."ALT"
11615            """
11616
11617            self.execute_query(query=query_update)
11618
11619        return True

The transcript_view_to_variants function updates a variants table with information from transcripts in JSON format.

Parameters
  • transcripts_table: The transcripts_table parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the param dictionary or use a default value of "transcripts"
  • transcripts_column_id: The transcripts_column_id parameter is used to specify the column in the transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database
  • transcripts_info_json: The transcripts_info_json parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts
  • transcripts_info_field_json: The transcripts_info_field_json parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name
  • transcripts_info_format: The transcripts_info_format parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table
  • transcripts_info_field_format: The transcripts_info_field_format parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name
  • param: The param parameter in the transcript_view_to_variants method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method. The param dictionary can be passed as an argument
Returns

The function transcript_view_to_variants returns a boolean value. It returns True if the operation is successful and False if certain conditions are not met.